Bike Analysis and Prediction

The following data is daily bike rentals joined with weather data. The goal of my analysis is to predict the number of unregistered users.

day.original = read.csv('day.csv')
day = day.original
#Fixing these values, they were illogical considering the other values for the days immediately surrounding them. 
#For the humidity, I found days that were similar in temperature and time of year, and also used values from the surrounding days. 
#For the atemp values, I calculated a temp/atemp ratio from the previous day, and the following day, and used that to ratio to set the atemp value.
day$hum[69] = mean(day$hum[c(26, 90, 106, 407, 478, 68, 68, 70, 70)])
day$atemp[595] = day$temp[595] * mean(c(day$atemp[594]/day$temp[594], day$atemp[596]/day$temp[596]))

Feature creation and data manipulation:

#factor which is basically an interaction term between mnth and workingday. This separates out the weekend from the weekday for each month, creating a total of  24 levels, with each level being of the form Jan-Weekday, Jan-Weekend, Feb-Weekday, etc.
day$mnthWeekend = as.factor(day$mnth * 2 + day$workingday)

day$season = as.factor(day$season)
day$mnth = as.factor(day$mnth)
day$weekday = as.factor(day$weekday)

day$hum = abs(mean(day$hum) - day$hum)

day$weekend = 0
day$weekend[which(day$weekday == 5)] = 1
day$weekend[which(day$weekday == 6)] = 2
day$weekend[which(day$weekday == 0)] = 3
day$weekend = as.factor(day$weekend)
day$workingday = as.factor(day$workingday)

#Creating Separate Holiday factors. This helps for some of the models I will use, while the other models will fail with it on.
#day$holiday[which((day$mnth == 4) & (day$holiday == 1))] = 4
#day$holiday[which((day$mnth == 7) & (day$holiday == 1))] = 7
#day$holiday[which((day$mnth ==10) & (day$holiday == 1))] = 10
#day$holiday[which((day$mnth == 11) & (day$holiday == 1))] = 11
#day$holiday = as.factor(day$holiday)
day$holidayseasons = 0
day$holidayseasons[which((day$mnth == 4) & (day$holiday == 1))] = 4
day$holidayseasons[which((day$mnth == 7) & (day$holiday == 1))] = 7
day$holidayseasons[which((day$mnth ==10) & (day$holiday == 1))] = 10
day$holidayseasons[which((day$mnth == 11) & (day$holiday == 1))] = 11

day$holidayseasons = as.factor(day$holidayseasons)



# previous days and weeks bike rentals, both casual bike rentals and the full count of caual + registered members
day$previousCasual = 0
day$previousCasual[2:731] = day$casual[1:730]
day$previousweekCasual = 0
day$previousweekCasual[8:731] = day$casual[1:724]

day$previousAll = 0
day$previousAll[2:731] = day$cnt[1:730]
day$previousweekAll = 0
day$previousweekAll[8:731] = day$cnt[1:724]

day$weathersit = as.factor(day$weathersit)

# This week factor is mostly useful for plotting
day$week = 0

i = 0
for(x in 2:360)
{
  if (day$weekday[x] == 0) {
    i = i +1
    day$week[x] = i
    day$week[x+371] = i }
  else 
  {
      day$week[x] = i
      day$week[x+371] = i }
}

day$week[361:365] = 52

# weeks which have substantially lower bike rentals
day$slowseason = 0
day$slowseason[which(day$week >=0 & day$week < 10)] = 1
day$slowseason[which(day$week >48)] = 1
day$slowseason = as.factor(day$slowseason)

#day$week = as.factor(day$week)

#ratio of yesterdays temperature to todays
day$temprat = 0
day$temprat[8:731] = day$atemp[8:731] / day$atemp[1:724]

#Temperature ratio as distance from 1

day$tempratAbs = abs(day$temprat -1)



#Average users per month on a weekend vs weekday basis
day$mnthAvgCasual = 0
for(x in 2:25)
{
  day$mnthAvgCasual[which(day$mnthWeekend==x)] = mean(day$casual[which(day$mnthWeekend == x )])
}
day$mnthAvgCasualF = as.factor(day$mnthAvgCasual)

day$mnthAvgAll = 0
for(x in 2:25)
{
  day$mnthAvgAll[which(day$mnthWeekend==x)] = mean(day$cnt[which(day$mnthWeekend == x )])
}

#The absolute value of Todays temperature divided by the average temperature for the month, minus 1.
day$mnthTempAvg = 0
for(x in 1:12)
{
  day$mnthTempAvg[which(day$mnth==x)] = mean(day$atemp[which(day$mnth == x)])
}

day$mnthTempAvg = abs(1- day$atemp/ day$mnthTempAvg)


day$mnthHumAvg = 0
for(x in 1:12)
{
  day$mnthHumAvg[which(day$mnth==x)] = mean(day$hum[which(day$mnth == x)])
}
day$mnthHumAvg = abs(1- day$hum/ day$mnthHumAvg)



day$seasonTempAvg = 0
for(x in 1:4)
{
  day$seasonTempAvg[which(day$season ==x)] = mean(day$atemp[which(day$season == x)])
}

day$seasonWeekend = as.factor(as.numeric(day$season) * 2 + as.numeric(day$workingday))
day$seasonAvgCasual = 0
for(x in 3:12)
{
  day$seasonAvgCasual[which(day$seasonWeekend==x)] = mean(day$casual[which(day$seasonWeekend == x )])
}
day$seasonAvgCasual = as.factor(day$seasonAvgCasual)

library(viridis)

## Loading required package: viridisLite

cols.heat = viridis(3410, begin=.1, end=.9, direction = -1)
day.cols = rep(0, times = 731)
for(x in 1: 731)
{
  day.cols[x] = cols.heat[day$casual[x]]
}
#backup
daycopy = day

library(boot)
library(splines)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:boot':
## 
##     logit

cv.error=0
excluded=-which(day$casual<20)

formula.weather =  (casual ~   yr:weekday+ yr:mnthAvgCasual:slowseason +season:weekday:windspeed + mnth +  mnthWeekend:temp  +  previousCasual:weathersit + previousweekCasual:weathersit:hum +  holiday:holidayseasons + seasonWeekend:tempratAbs:hum:windspeed)


#(casual ~ yr:weekday  +  yr:mnthAvgCasual:slowseason + season:weekday + mnth +  mnth:weekend:atemp + hum:weekend + windspeed:weekend +  previousCasual + previousweekCasual:weathersit:hum + holiday + season:temprat  + previousAll:previousCasual)
#(casual ~   yr:seasonWeekend + mnthAvgCasual:weathersit + season+ season:weekend:atemp + hum:windspeed:weekend + weathersit:previousCasual +  holiday)

glm.1 = glm(formula.weather, data=day)
summary(glm.1)

## 
## Call:
## glm(formula = formula.weather, data = day)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -937.10  -104.82    -6.33   103.54  1043.63  
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               1.292e+02  9.082e+01   1.423 0.155311
## mnth2                                    -1.386e+02  1.466e+02  -0.945 0.344841
## mnth3                                    -5.076e+02  1.446e+02  -3.511 0.000478
## mnth4                                    -1.519e+02  1.914e+02  -0.793 0.427908
## mnth5                                     3.545e+02  2.578e+02   1.375 0.169689
## mnth6                                     1.565e+03  3.350e+02   4.672 3.64e-06
## mnth7                                     4.709e+03  4.540e+02  10.372  < 2e-16
## mnth8                                     7.696e+02  5.151e+02   1.494 0.135620
## mnth9                                     1.044e+03  3.117e+02   3.348 0.000861
## mnth10                                   -2.526e+02  2.106e+02  -1.199 0.230871
## mnth11                                   -4.505e+02  2.042e+02  -2.206 0.027742
## mnth12                                   -7.012e+01  1.716e+02  -0.409 0.683020
## yr:weekday0                               2.638e+02  8.138e+01   3.241 0.001252
## yr:weekday1                               9.444e+01  5.602e+01   1.686 0.092294
## yr:weekday2                               1.334e+02  5.352e+01   2.493 0.012907
## yr:weekday3                               1.267e+02  5.126e+01   2.471 0.013720
## yr:weekday4                               1.738e+02  5.149e+01   3.376 0.000781
## yr:weekday5                               3.137e+02  5.236e+01   5.992 3.46e-09
## yr:weekday6                               5.154e+02  8.028e+01   6.421 2.64e-10
## mnthWeekend2:temp                         1.693e+03  3.938e+02   4.298 1.99e-05
## mnthWeekend3:temp                         1.411e+02  3.779e+02   0.373 0.708902
## mnthWeekend4:temp                         2.724e+03  4.704e+02   5.790 1.10e-08
## mnthWeekend5:temp                         7.114e+02  3.757e+02   1.894 0.058738
## mnthWeekend6:temp                         4.991e+03  3.548e+02  14.068  < 2e-16
## mnthWeekend7:temp                         2.273e+03  2.886e+02   7.875 1.47e-14
## mnthWeekend8:temp                         3.684e+03  4.094e+02   8.999  < 2e-16
## mnthWeekend9:temp                         1.328e+03  3.517e+02   3.775 0.000175
## mnthWeekend10:temp                        2.313e+03  4.195e+02   5.515 5.08e-08
## mnthWeekend11:temp                        4.149e+02  4.053e+02   1.024 0.306398
## mnthWeekend12:temp                       -2.643e+01  4.754e+02  -0.056 0.955676
## mnthWeekend13:temp                       -1.390e+03  4.627e+02  -3.004 0.002771
## mnthWeekend14:temp                       -4.174e+03  5.823e+02  -7.169 2.10e-12
## mnthWeekend15:temp                       -5.325e+03  5.825e+02  -9.142  < 2e-16
## mnthWeekend16:temp                        1.095e+03  7.392e+02   1.481 0.139004
## mnthWeekend17:temp                       -2.541e+01  7.187e+02  -0.035 0.971808
## mnthWeekend18:temp                        1.266e+03  4.832e+02   2.620 0.009007
## mnthWeekend19:temp                       -7.228e+02  4.884e+02  -1.480 0.139442
## mnthWeekend20:temp                        4.364e+03  4.455e+02   9.796  < 2e-16
## mnthWeekend21:temp                        1.549e+03  3.739e+02   4.142 3.90e-05
## mnthWeekend22:temp                        4.185e+03  5.193e+02   8.059 3.80e-15
## mnthWeekend23:temp                        1.849e+03  4.836e+02   3.823 0.000145
## mnthWeekend24:temp                        2.280e+03  5.153e+02   4.424 1.14e-05
## mnthWeekend25:temp                        7.846e+02  4.490e+02   1.747 0.081052
## previousCasual:weathersit1                6.913e-02  2.583e-02   2.676 0.007641
## previousCasual:weathersit2                5.524e-02  2.818e-02   1.960 0.050404
## previousCasual:weathersit3               -1.077e-01  1.405e-01  -0.767 0.443587
## holiday:holidayseasons0                  -3.484e+02  9.445e+01  -3.689 0.000245
## holiday:holidayseasons4                  -1.285e+03  2.058e+02  -6.244 7.80e-10
## holiday:holidayseasons7                   9.987e+02  1.794e+02   5.566 3.84e-08
## holiday:holidayseasons10                 -9.049e+02  1.838e+02  -4.922 1.09e-06
## holiday:holidayseasons11                 -6.661e+02  1.524e+02  -4.370 1.45e-05
## yr:mnthAvgCasual:slowseason0              1.022e-01  4.345e-02   2.353 0.018944
## yr:mnthAvgCasual:slowseason1             -3.700e-01  1.215e-01  -3.046 0.002412
## weekday0:season1:windspeed               -1.895e+03  3.107e+02  -6.098 1.86e-09
## weekday1:season1:windspeed               -3.400e+02  2.850e+02  -1.193 0.233294
## weekday2:season1:windspeed               -4.334e+02  2.918e+02  -1.485 0.137934
## weekday3:season1:windspeed               -3.996e+02  2.574e+02  -1.552 0.121114
## weekday4:season1:windspeed               -6.439e+02  2.742e+02  -2.348 0.019157
## weekday5:season1:windspeed               -5.127e+02  2.475e+02  -2.071 0.038725
## weekday6:season1:windspeed               -2.440e+03  3.344e+02  -7.298 8.70e-13
## weekday0:season2:windspeed               -1.037e+01  4.696e+02  -0.022 0.982388
## weekday1:season2:windspeed               -3.896e+02  2.866e+02  -1.359 0.174486
## weekday2:season2:windspeed               -7.255e+02  2.733e+02  -2.655 0.008134
## weekday3:season2:windspeed               -6.648e+02  2.926e+02  -2.273 0.023383
## weekday4:season2:windspeed               -2.631e+02  2.770e+02  -0.950 0.342525
## weekday5:season2:windspeed                4.346e+02  3.209e+02   1.354 0.176079
## weekday6:season2:windspeed               -1.856e+02  4.495e+02  -0.413 0.679838
## weekday0:season3:windspeed               -1.050e+03  4.550e+02  -2.307 0.021347
## weekday1:season3:windspeed               -9.160e+02  3.748e+02  -2.444 0.014789
## weekday2:season3:windspeed               -7.341e+02  3.649e+02  -2.012 0.044655
## weekday3:season3:windspeed               -9.211e+02  4.016e+02  -2.293 0.022144
## weekday4:season3:windspeed               -7.099e+02  3.592e+02  -1.976 0.048548
## weekday5:season3:windspeed               -1.250e+02  3.952e+02  -0.316 0.751927
## weekday6:season3:windspeed               -6.032e+02  4.187e+02  -1.441 0.150180
## weekday0:season4:windspeed               -2.850e+03  4.113e+02  -6.930 1.03e-11
## weekday1:season4:windspeed               -5.202e+02  3.746e+02  -1.389 0.165415
## weekday2:season4:windspeed               -6.115e+02  3.390e+02  -1.804 0.071772
## weekday3:season4:windspeed               -6.525e+02  3.236e+02  -2.016 0.044167
## weekday4:season4:windspeed               -5.526e+02  2.865e+02  -1.929 0.054204
## weekday5:season4:windspeed                9.455e+01  3.269e+02   0.289 0.772487
## weekday6:season4:windspeed               -1.273e+03  4.149e+02  -3.068 0.002245
## weathersit1:previousweekCasual:hum        4.550e-01  1.259e-01   3.614 0.000325
## weathersit2:previousweekCasual:hum       -1.498e+00  1.611e-01  -9.298  < 2e-16
## weathersit3:previousweekCasual:hum       -2.067e+00  5.096e-01  -4.056 5.60e-05
## windspeed:hum:seasonWeekend3:tempratAbs   4.801e+03  1.610e+03   2.981 0.002982
## windspeed:hum:seasonWeekend4:tempratAbs   7.881e+02  1.849e+03   0.426 0.670152
## windspeed:hum:seasonWeekend5:tempratAbs  -1.667e+04  6.898e+03  -2.416 0.015965
## windspeed:hum:seasonWeekend6:tempratAbs  -7.259e+02  2.425e+03  -0.299 0.764755
## windspeed:hum:seasonWeekend7:tempratAbs  -4.181e+04  2.012e+04  -2.077 0.038168
## windspeed:hum:seasonWeekend8:tempratAbs   2.562e+04  1.394e+04   1.838 0.066462
## windspeed:hum:seasonWeekend9:tempratAbs   1.419e+03  6.180e+03   0.230 0.818473
## windspeed:hum:seasonWeekend10:tempratAbs -2.043e+03  5.774e+03  -0.354 0.723648
##                                             
## (Intercept)                                 
## mnth2                                       
## mnth3                                    ***
## mnth4                                       
## mnth5                                       
## mnth6                                    ***
## mnth7                                    ***
## mnth8                                       
## mnth9                                    ***
## mnth10                                      
## mnth11                                   *  
## mnth12                                      
## yr:weekday0                              ** 
## yr:weekday1                              .  
## yr:weekday2                              *  
## yr:weekday3                              *  
## yr:weekday4                              ***
## yr:weekday5                              ***
## yr:weekday6                              ***
## mnthWeekend2:temp                        ***
## mnthWeekend3:temp                           
## mnthWeekend4:temp                        ***
## mnthWeekend5:temp                        .  
## mnthWeekend6:temp                        ***
## mnthWeekend7:temp                        ***
## mnthWeekend8:temp                        ***
## mnthWeekend9:temp                        ***
## mnthWeekend10:temp                       ***
## mnthWeekend11:temp                          
## mnthWeekend12:temp                          
## mnthWeekend13:temp                       ** 
## mnthWeekend14:temp                       ***
## mnthWeekend15:temp                       ***
## mnthWeekend16:temp                          
## mnthWeekend17:temp                          
## mnthWeekend18:temp                       ** 
## mnthWeekend19:temp                          
## mnthWeekend20:temp                       ***
## mnthWeekend21:temp                       ***
## mnthWeekend22:temp                       ***
## mnthWeekend23:temp                       ***
## mnthWeekend24:temp                       ***
## mnthWeekend25:temp                       .  
## previousCasual:weathersit1               ** 
## previousCasual:weathersit2               .  
## previousCasual:weathersit3                  
## holiday:holidayseasons0                  ***
## holiday:holidayseasons4                  ***
## holiday:holidayseasons7                  ***
## holiday:holidayseasons10                 ***
## holiday:holidayseasons11                 ***
## yr:mnthAvgCasual:slowseason0             *  
## yr:mnthAvgCasual:slowseason1             ** 
## weekday0:season1:windspeed               ***
## weekday1:season1:windspeed                  
## weekday2:season1:windspeed                  
## weekday3:season1:windspeed                  
## weekday4:season1:windspeed               *  
## weekday5:season1:windspeed               *  
## weekday6:season1:windspeed               ***
## weekday0:season2:windspeed                  
## weekday1:season2:windspeed                  
## weekday2:season2:windspeed               ** 
## weekday3:season2:windspeed               *  
## weekday4:season2:windspeed                  
## weekday5:season2:windspeed                  
## weekday6:season2:windspeed                  
## weekday0:season3:windspeed               *  
## weekday1:season3:windspeed               *  
## weekday2:season3:windspeed               *  
## weekday3:season3:windspeed               *  
## weekday4:season3:windspeed               *  
## weekday5:season3:windspeed                  
## weekday6:season3:windspeed                  
## weekday0:season4:windspeed               ***
## weekday1:season4:windspeed                  
## weekday2:season4:windspeed               .  
## weekday3:season4:windspeed               *  
## weekday4:season4:windspeed               .  
## weekday5:season4:windspeed                  
## weekday6:season4:windspeed               ** 
## weathersit1:previousweekCasual:hum       ***
## weathersit2:previousweekCasual:hum       ***
## weathersit3:previousweekCasual:hum       ***
## windspeed:hum:seasonWeekend3:tempratAbs  ** 
## windspeed:hum:seasonWeekend4:tempratAbs     
## windspeed:hum:seasonWeekend5:tempratAbs  *  
## windspeed:hum:seasonWeekend6:tempratAbs     
## windspeed:hum:seasonWeekend7:tempratAbs  *  
## windspeed:hum:seasonWeekend8:tempratAbs  .  
## windspeed:hum:seasonWeekend9:tempratAbs     
## windspeed:hum:seasonWeekend10:tempratAbs    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 47040.88)
## 
##     Null deviance: 344158822  on 730  degrees of freedom
## Residual deviance:  30059120  on 639  degrees of freedom
## AIC: 10027
## 
## Number of Fisher Scoring iterations: 2

vif(glm.1)

##                                                GVIF Df GVIF^(1/(2*Df))
## mnth                                   1.937211e+19 11        7.528178
## yr:weekday                             4.691561e+02  7        1.551702
## mnthWeekend:temp                       2.043182e+22 24        2.916071
## previousCasual:weathersit              4.798553e+01  3        1.906273
## holiday:holidayseasons                 1.206832e+01  5        1.282817
## yr:mnthAvgCasual:slowseason            1.848836e+01  2        2.073598
## weekday:season:windspeed               1.124336e+07 28        1.336315
## weathersit:previousweekCasual:hum      2.860791e+01  3        1.748830
## windspeed:hum:seasonWeekend:tempratAbs 4.682384e+02  8        1.468602

set.seed(1)
cv.error = cv.glm(day, glm.1)$delta[1]
cv.error

## [1] 59606.02

ResidualMagnitudeSquares = ( (abs(glm.1$residuals) + day$casual) / day$casual) ^ 2

# Residual Magnitude Sum of Squares
RMSS =  sum( ResidualMagnitudeSquares)
RMSS

## [1] 1773.221

plot(1:731,ResidualMagnitudeSquares)

totalpredictednew = glm.1$fitted.values

totalpredicted1=0
totalpredicted2=0
mm=2

formula.predictive =  (casual ~   yr:weekday+ yr:mnthAvgCasual +season:weekday + mnth +  mnth:weekend:atemp + hum:weekend + windspeed:weekend+  previousCasual:weathersit + previousweekCasual:weathersit:hum +  holiday + season:tempratAbs)

# 5x2 cross validation, this is explained in detail later under the boosted trees section
for (m in 1:mm) {
  set.seed(m)
  folds = sample(1:5, nrow(day), replace = TRUE)
  
  for (j in 1:5) {
    set.seed(1)
    glm.2 = glm(formula.predictive, data=day[folds!=j,])
    predicted = predict(glm.2, newdata = day[folds == j,])
    
    
    
    
    
    if (m == 1) {
      #first loop, cbind adds an index which we will merge on later, rbind combines all 731 predictions into one object
      day.predicted = cbind(day$instant[folds == j], '3' = predicted)
      totalpredicted1 = rbind(totalpredicted1, day.predicted)
    }
    else{
      #second loop
      day.predicted = cbind(day$instant[folds == j], '4' = predicted)
      totalpredicted2 = rbind(totalpredicted2, day.predicted)
      
    }
  }
}
totalpredictednew = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredictednew$average = rowMeans(totalpredictednew[, 2:3],)

#(((totalpredictednew$average - day$casual)*100/day$casual)^2)

glmresiduals = totalpredictednew$average - day$casual

ResidualMagnitudeSquares = ((abs(glmresiduals) + day$casual) / day$casual) ^2

RMSS =  sum( ResidualMagnitudeSquares)
RMSS

plot(1:731,ResidualMagnitudeSquares)

Diagnostic plots, here I take the outliers, the elements with the largest residuals, and the elements with the largest leverage and try to determine what features I could add/modify in order to improve the model fit

d=302
#diag.cols = c(3:5, 7:10, 12:13, 17:20,23, 25, 26:32 )
diag.cols = c('week', 'seasonWeekend', 'atemp', 'previousweekCasual')
par(mfrow=c(2,2), mar=c(4,4,4,4),oma = c(0, 0, 2, 2))

for(x in diag.cols)
{
  plot(day[,x],day[,14], pch = 18,col=day.cols ,xlab = colnames(day)[x], ylab='Casual Users')
  if(x=='seasonWeekend'){
    legend(x = "topright",
       inset = c(0, -.25), # You will need to fine-tune the first
                            # value depending on the windows size
       legend = c("448"), 
       pch=17,
       col = 2,
       xpd = TRUE)
  }
  points(day[d,x], day[d,14],pch=17, col=2)
}

Boosted Trees

Loading the best value from previous tests

library(gbm)

## Loaded gbm 2.1.8

load('bikepercentage1.RData')

The following code performances stepwise selection and parameter selection through cross validation. First I’ll run the model from the last time I did stepwise selection for my gbm to get a baseline.

interactions = 3
flag = FALSE

#folds
k = 5

#number of times we preform the 5 fold cv
mm = 2

# when you get to the point that ''##begin 5fold cv x 2'' shows up, this is the code that runs. We run it here to get a baseline error. Code comments for this cv will be listed here, and left out in subsequent iterations

#these variables store all of the predictions our models make. Doing 5 fold cv, you predict on every observation. Since we are doing 5 fold X 2, we will predict on every observation twice. So we will take all of our predictions, and for each observation we will average our predictions, then calculate RSS as normal.
totalpredicted1 = 0
totalpredicted2 = 0




# outer loop iterates twice
for (m in 1:mm) {
  set.seed(m)
  # calculate new folds for each outer loop. This will help average out potential over/under fitting because our model will use 2 randomized training sets to predict on each observation.
  folds = sample(1:k, nrow(day1), replace = TRUE)
  
  
  
  
  #standard 5 fold.
  for (j in 1:k) {
    set.seed(1)
    day1.boost = gbm(
      casual ~ .,
      data = day1[folds != j,],
      n.trees = besttree,
      interaction.depth = interactions,
      shrinkage = bestshrink,
      n.cores = 4
    )
    predicted = predict(day1.boost, newdata = day1[folds == j,], n.trees = besttree)
    
    
    
    
    
    if (m == 1) {
      #first loop, cbind adds an index which we will merge on later, rbind combines all 731 predictions into one object
      day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
      totalpredicted1 = rbind(totalpredicted1, day1.predicted)
    }
    else{
      #second loop
      day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
      totalpredicted2 = rbind(totalpredicted2, day1.predicted)
      
    }
  }
}

## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...

#merge all of our predictions, then average our predictions for each instance, then use the averaged prediction to calculate the RSS
totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredicted$average = rowMeans(totalpredicted[, 2:3])

par(mfrow=c(1,1), mar=c(4,4,4,4),oma = c(1,1,1,1))

newresiduals = ((totalpredicted$average + totalpredictednew)/2) - day1$casual

plot(1:731, newresiduals, xlab = 'instance', ylab = 'residual')

baseline = sum(((abs(newresiduals) + day1$casual)/day1$casual)^2)

baselineerror = mean((newresiduals ) ^ 2)

Begin Stepwise selection and parameter tuning

#this outerloop alternates between stepwise selection and parameter tuning
for (z in 1:5) {
  #This is the stepwise selection loop, first we check if adding a feature improves performance, then we check if removing a feature improves performance
  
  # currently checking if we need to add/remove features 10 times before moving on to parameter tuning
  flag=FALSE
  for (i in 1:5) {
    print(i)
    print(names(day1))
    print(baseline)

    #day1.baseline is our checkpoint, once we add/remove a feature to the model we use day1.baseline to reset to before the add/remove
    day1.baseline = day1
    day1.boost.cv.add = 1:length(withheld)
    
    #This variable is used to see if we actually added any features for each loop, if we dont add/remove features for one loop, then we will exit the loop and go back to shrinkage/tree selection
    old.names = names(day1)
    
    
    
    
    
    # # #  add features
    print('adding')
    for (x in 1:length(withheld)) {
      print(x)
      day1 = day1.baseline
      day1 = cbind(day1, daycopy[which(names(daycopy) == withheld[x])])
      
      
      
      
      ##begin 5 fold cv X 2
      totalpredicted1 = 0
      totalpredicted2 = 0
      for (m in 1:mm) {
        set.seed(m)
        folds = sample(1:k, nrow(day1), replace = TRUE)
        for (j in 1:k) {
          set.seed(1)
          day1.boost = gbm(
            casual ~ .,
            data = day1[folds != j,],
            n.trees = besttree,
            interaction.depth = interactions,
            shrinkage = bestshrink,
            n.cores = 4,
            distribution = 'gaussian'
          )
          predicted = predict(day1.boost,
                              newdata = day1[folds == j,],
                              n.trees = besttree)
          if (m == 2) {
            day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
            totalpredicted2 = rbind(totalpredicted2, day1.predicted)
          }
          else{
            day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
            totalpredicted1 = rbind(totalpredicted1, day1.predicted)
            
          }
        }
      }
      ##end 5 fold cv
      
      totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
      totalpredicted$average = rowMeans(totalpredicted[, 2:3],)
      newresiduals = ((totalpredicted$average + totalpredictednew) / 2) - day1$casual
      
      
      #report the cv error for the x-th feature
      #day1.boost.cv.add[x] = mean((totalpredicted$average - day1$casual) ^ 2)
      #day1.boost.cv.add[x] = mean((abs(((totalpredicted$average + totalpredictednew$average)/2) - day1$casual)/day1$casual))
      
      day1.boost.cv.add[x] = sum(((abs(newresiduals) + day1$casual) / day1$casual) ^
                                   2)
      #day1.boost.cv.add[x] = mean((totalpredicted$average - day1$casual + glmresiduals) ^ 2)
      
      
      
    }
    
    
    
    
    
    #check if any of the x features that are currently withheld make the model better, if they do, add them to the model
    day1 = day1.baseline
    if (min(day1.boost.cv.add) < baseline) {
      print('update')
      plot(1:731, residuals)
      #update our new best error
      baseline = min(day1.boost.cv.add)
      #index for the best addition
      best = which.min(day1.boost.cv.add)
      print(withheld[best])
      #add it back to the model with cbind
      readd = which(names(daycopy) == withheld[best])
      day1 = cbind(day1, daycopy[readd])
      #update our withheld list
      withheld = withheld[-best]
    }
    #update our checkpoint
    day1.baseline = day1
    
    day1.boost.cv.remove = 1:length(names(day1))
    
    
    
    
    
    
    
    
    
    
    
    # # # remove features
    print('removing')
    for (x in 2:length(names(day1))) {
      print(x)
      day1 = day1.baseline[-c(x)]
      
      
      
      
      ##begin 5 fold cv X 2
      totalpredicted1 = 0
      totalpredicted2 = 0
      for (m in 1:mm) {
        set.seed(m)
        folds = sample(1:k, nrow(day1), replace = TRUE)
        for (j in 1:k) {
          set.seed(1)
          day1.boost = gbm(
            casual ~ .,
            data = day1[folds != j,],
            n.trees = besttree,
            interaction.depth = interactions,
            shrinkage = bestshrink,
            n.cores = 4,
            distribution = 'gaussian'
          )
          predicted = predict(day1.boost,
                              newdata = day1[folds == j,],
                              n.trees = besttree)
          if (m == 2) {
            day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
            totalpredicted2 = rbind(totalpredicted2, day1.predicted)
          }
          else{
            day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
            totalpredicted1 = rbind(totalpredicted1, day1.predicted)
            
          }
        }
      }
      ##end 5 fold cv
      
      
      totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
      totalpredicted$average = rowMeans(totalpredicted[, 2:3],)
      newresiduals = ((totalpredicted$average + totalpredictednew) / 2) - day1$casual
      
      
      
      #report the cv error for the x-th feature
      
      
      day1.boost.cv.remove[x] = sum(((abs(newresiduals) + day1$casual) /
                                       day1$casual) ^ 2)
    }
    
    #reset to checkpoint
    day1 = day1.baseline
    removederror = min(day1.boost.cv.remove[-1])
    
    #check if removing makes things better
    if (removederror < baseline) {
      print('update')
      plot(1:731, residuals)
      worst = which.min(day1.boost.cv.remove[-1]) + 1
      day1 = day1[-worst]
      day1.removed = names(day1.baseline[worst])
      print(day1.removed)
      withheld = c(day1.removed, withheld)
      baseline = removederror
    }
    
    
    
    
    
    #checking to see if anything was added/removed
    
    if (identical(old.names, names(day1))) {
      #if nothing was added on the very first iteration break out of the stepwise selection
      if (i == 1) {
        flag = TRUE
      }
      break
    }
  }
  
  
  # this sets the ideal shrinkage and tree value by iterating over both.
  
  trees = floor(seq(1000, 7000, length.out = 8))
  shrinks = c(seq(.001, .05, length.out = 10))
  
  #checkpoint the best tree value to see if we changed it at all, if we haven't changed the tree value, and we haven't added/removed, break out of the loop and report the predictors used for the best model.
  old.tree = besttree
  min.loop.error = 0
  for (y in 1:length(trees)) {
    print(y)
    for (x in 1:length(shrinks)) {
      ##begin 5 fold cv X 2
      totalpredicted1 = 0
      totalpredicted2 = 0
      for (m in 1:mm) {
        set.seed(m)
        folds = sample(1:k, nrow(day1), replace = TRUE)
        for (j in 1:k) {
          set.seed(1)
          day1.boost = gbm(
            casual ~ .,
            data = day1[folds != j, ],
            n.trees = trees[y],
            interaction.depth = interactions,
            shrinkage = shrinks[x],
            n.cores = 4,
            distribution = 'gaussian'
          )
          
          predicted = predict(day1.boost,
                              newdata = day1[folds == j,],
                              n.trees = trees[y])
          if (m == 2) {
            day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
            totalpredicted2 = rbind(totalpredicted2, day1.predicted)
          }
          else{
            day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
            totalpredicted1 = rbind(totalpredicted1, day1.predicted)
            
          }
        }
      }
      ##end 5 fold cv
      
      totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
      totalpredicted$average = rowMeans(totalpredicted[, 2:3],)
      newresiduals = ((totalpredicted$average + totalpredictednew) / 2) - day1$casual
      
      #set the error for the current loop
      loop.error = sum(((abs(newresiduals) + day1$casual) / day1$casual) ^
                         2)
      #record the minimum error for the current loop
      if (x == 1) {
        min.loop.error = loop.error
      } else if (loop.error < min.loop.error) {
        min.loop.error = loop.error
      }
      
      
      
      #print for diagnostic
      print(loop.error)
      
      
      
      #check if the loop error is better than the current best parameters, if it is use the current parameters as the best
      if (loop.error < baseline) {
        print('update')

        plot(1:731, residuals)
        baseline = loop.error
        besttree = trees[y]
        bestshrink = shrinks[x]
      }
      
      #At first the loop will decresase in error each time since the shrinkage parameter is increasing, but at a certain point the loop will increase in error again, once this happens we can break the loop to speed up the overall process
      if (loop.error / min.loop.error > 1.01) {
        break
      }
    }
  }
  if (old.tree == besttree & flag == TRUE) {
    break
  }
  
}

names(day1)

averagedpredicted = rowMeans(cbind(totalpredicted$average,totalpredictednew))
bestresiduals = averagedpredicted - day1$casual
besterror = mean((averagedpredicted - day1$casual) ^ 2)
save(withheld, besttree, bestshrink, baseline, day1, file='bikepercentage1.RData')

# For 69% of days, the estimate of this model is within 25% of what was actually experienced. 
length(which(abs((averagedpredicted - day$casual) / day$casual ) < .25))/731

## [1] 0.6935705

length(which(abs(bestresiduals)<200))/731

## [1] 0.7633379

with(dayplot, which(mnth %in% 3:9))

how many are within 25% of target

length(which(abs((averagedpredicted - day\(casual) / day\)casual ) > .25))/731

Bike Analysis and Prediction

Kyle Youngblood

Boosted Trees