library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1.9000 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.8.0          ✔ stringr 1.3.0     
## ✔ readr   1.1.1          ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::vars()   masks ggplot2::vars()
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin

Data and other information from Detroit_Draft_3.Rmd:

complete_tally_set <- read_rds("calculated_tallies.rds")

complete_formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  num_nearby_blighted_parcels + num_violations_nearby_parcels + earlier_nuiscance_offences

training_parcelnums <- read_rds("training_parcelnums.rds")

We partition the dataset for k-fold cross-validation and train rpart models on the respective portions.

#separate the examples to be used for training from the twenty percent of the examples withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- complete_tally_set %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(451)
k_folds <- caret::createFolds(train$blighted)

models <- 1:10 %>% map(~ rpart(complete_formula, data = train[-k_folds[[.x]],],
                               method = "class", control = rpart.control(cp = 0.003)))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7105073
sd(as.numeric(accuracies))
## [1] 0.02807713
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 119  42
##   TRUE   67 136
##        truth
## pred      0   1
##   FALSE 139  42
##   TRUE   48 136
##        truth
## pred      0   1
##   FALSE 133  41
##   TRUE   53 137
##        truth
## pred      0   1
##   FALSE 139  54
##   TRUE   47 124
##        truth
## pred      0   1
##   FALSE 111  48
##   TRUE   75 130
##        truth
## pred      0   1
##   FALSE 126  46
##   TRUE   60 132
##        truth
## pred      0   1
##   FALSE 147  59
##   TRUE   39 119
##        truth
## pred      0   1
##   FALSE 128  46
##   TRUE   58 132
##        truth
## pred      0   1
##   FALSE 125  53
##   TRUE   61 125
##        truth
## pred      0   1
##   FALSE 120  49
##   TRUE   66 129
models[[3]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 3277 1602 0 (0.5111382 0.4888618)  
##      2) num_nearby_blighted_parcels< 1.5 1545  460 0 (0.7022654 0.2977346)  
##        4) num_vacant_parcels< 50.5 1291  303 0 (0.7652982 0.2347018)  
##          8) total_fines_by_parcel< 25 918  146 0 (0.8409586 0.1590414) *
##          9) total_fines_by_parcel>=25 373  157 0 (0.5790885 0.4209115)  
##           18) total_fines_by_parcel< 512.5 199   67 0 (0.6633166 0.3366834) *
##           19) total_fines_by_parcel>=512.5 174   84 1 (0.4827586 0.5172414)  
##             38) number_of_violations_by_parcel>=2.5 133   60 0 (0.5488722 0.4511278)  
##               76) total_fines_by_parcel< 3025 107   41 0 (0.6168224 0.3831776)  
##                152) earlier_violent_crime_incidents< 399.5 100   35 0 (0.6500000 0.3500000) *
##                153) earlier_violent_crime_incidents>=399.5 7    1 1 (0.1428571 0.8571429) *
##               77) total_fines_by_parcel>=3025 26    7 1 (0.2692308 0.7307692) *
##             39) number_of_violations_by_parcel< 2.5 41   11 1 (0.2682927 0.7317073) *
##        5) num_vacant_parcels>=50.5 254   97 1 (0.3818898 0.6181102)  
##         10) earlier_property_crime_incidents>=108.5 151   74 1 (0.4900662 0.5099338)  
##           20) total_fines_by_parcel< 75 92   39 0 (0.5760870 0.4239130) *
##           21) total_fines_by_parcel>=75 59   21 1 (0.3559322 0.6440678)  
##             42) earlier_violent_crime_incidents< 137 7    1 0 (0.8571429 0.1428571) *
##             43) earlier_violent_crime_incidents>=137 52   15 1 (0.2884615 0.7115385) *
##         11) earlier_property_crime_incidents< 108.5 103   23 1 (0.2233010 0.7766990) *
##      3) num_nearby_blighted_parcels>=1.5 1732  590 1 (0.3406467 0.6593533)  
##        6) num_nearby_blighted_parcels< 4.5 886  384 1 (0.4334086 0.5665914)  
##         12) num_vacant_parcels< 92.5 699  332 1 (0.4749642 0.5250358)  
##           24) total_fines_by_parcel< 25 466  215 0 (0.5386266 0.4613734)  
##             48) council_di=3,6,7 203   78 0 (0.6157635 0.3842365)  
##               96) earlier_nuiscance_offences< 10.5 196   72 0 (0.6326531 0.3673469) *
##               97) earlier_nuiscance_offences>=10.5 7    1 1 (0.1428571 0.8571429) *
##             49) council_di=1,2,4,5 263  126 1 (0.4790875 0.5209125)  
##               98) num_violations_nearby_parcels>=49.5 40   11 0 (0.7250000 0.2750000) *
##               99) num_violations_nearby_parcels< 49.5 223   97 1 (0.4349776 0.5650224)  
##                198) total_acre>=0.14 35   12 0 (0.6571429 0.3428571) *
##                199) total_acre< 0.14 188   74 1 (0.3936170 0.6063830)  
##                  398) num_vacant_parcels>=89.5 9    2 0 (0.7777778 0.2222222) *
##                  399) num_vacant_parcels< 89.5 179   67 1 (0.3743017 0.6256983)  
##                    798) earlier_property_crime_incidents>=166.5 67   33 0 (0.5074627 0.4925373)  
##                     1596) earlier_violent_crime_incidents< 239.5 32   10 0 (0.6875000 0.3125000) *
##                     1597) earlier_violent_crime_incidents>=239.5 35   12 1 (0.3428571 0.6571429) *
##                    799) earlier_property_crime_incidents< 166.5 112   33 1 (0.2946429 0.7053571) *
##           25) total_fines_by_parcel>=25 233   81 1 (0.3476395 0.6523605) *
##         13) num_vacant_parcels>=92.5 187   52 1 (0.2780749 0.7219251) *
##        7) num_nearby_blighted_parcels>=4.5 846  206 1 (0.2434988 0.7565012) *

Although I believe it is useful to see a quantitative relationship between blight in one building and blight in nearby buildings, some may object to the use of blight to predict blight. (To be clear, the count of nearby blighted parcels does not include the parcel for which we are making predictions.) As can be seen below, removing this variable results in a model whose accuracy is roughly 1.5 percentage points lower.

complete_formula_minus_nearby_blight <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  num_violations_nearby_parcels + earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(complete_formula_minus_nearby_blight, data = train[-k_folds[[.x]],],
                               method = "class", control = rpart.control(cp = 0.003)))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6959597
sd(as.numeric(accuracies))
## [1] 0.02561397
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 129  51
##   TRUE   57 127
##        truth
## pred      0   1
##   FALSE 129  49
##   TRUE   58 129
##        truth
## pred      0   1
##   FALSE 138  47
##   TRUE   48 131
##        truth
## pred      0   1
##   FALSE 126  54
##   TRUE   60 124
##        truth
## pred      0   1
##   FALSE 116  58
##   TRUE   70 120
##        truth
## pred      0   1
##   FALSE 120  45
##   TRUE   66 133
##        truth
## pred      0   1
##   FALSE 127  53
##   TRUE   59 125
##        truth
## pred      0   1
##   FALSE 124  45
##   TRUE   62 133
##        truth
## pred      0   1
##   FALSE 132  49
##   TRUE   54 129
##        truth
## pred      0   1
##   FALSE 112  48
##   TRUE   74 130
models[[3]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##    1) root 3277 1602 0 (0.5111382 0.4888618)  
##      2) num_vacant_parcels< 35.5 1484  439 0 (0.7041779 0.2958221)  
##        4) total_fines_by_parcel< 35 1034  220 0 (0.7872340 0.2127660) *
##        5) total_fines_by_parcel>=35 450  219 0 (0.5133333 0.4866667)  
##         10) total_fines_by_parcel< 512.5 236   96 0 (0.5932203 0.4067797) *
##         11) total_fines_by_parcel>=512.5 214   91 1 (0.4252336 0.5747664)  
##           22) number_of_violations_by_parcel>=2.5 154   75 1 (0.4870130 0.5129870)  
##             44) total_fines_by_parcel< 3025 124   57 0 (0.5403226 0.4596774)  
##               88) earlier_violent_crime_incidents< 399.5 115   49 0 (0.5739130 0.4260870)  
##                176) earlier_property_crime_incidents>=246.5 20    2 0 (0.9000000 0.1000000) *
##                177) earlier_property_crime_incidents< 246.5 95   47 0 (0.5052632 0.4947368)  
##                  354) num_violations_nearby_parcels< 19.5 21    5 0 (0.7619048 0.2380952) *
##                  355) num_violations_nearby_parcels>=19.5 74   32 1 (0.4324324 0.5675676) *
##               89) earlier_violent_crime_incidents>=399.5 9    1 1 (0.1111111 0.8888889) *
##             45) total_fines_by_parcel>=3025 30    8 1 (0.2666667 0.7333333) *
##           23) number_of_violations_by_parcel< 2.5 60   16 1 (0.2666667 0.7333333) *
##      3) num_vacant_parcels>=35.5 1793  630 1 (0.3513664 0.6486336)  
##        6) num_vacant_parcels< 65.5 801  352 1 (0.4394507 0.5605493)  
##         12) total_fines_by_parcel< 25 518  255 0 (0.5077220 0.4922780)  
##           24) council_di=4,5,6 205   72 0 (0.6487805 0.3512195)  
##             48) total_acre>=0.0655 191   61 0 (0.6806283 0.3193717) *
##             49) total_acre< 0.0655 14    3 1 (0.2142857 0.7857143) *
##           25) council_di=1,2,3,7 313  130 1 (0.4153355 0.5846645)  
##             50) total_acre< 0.0825 29    9 0 (0.6896552 0.3103448) *
##             51) total_acre>=0.0825 284  110 1 (0.3873239 0.6126761)  
##              102) total_acre>=0.143 22    6 0 (0.7272727 0.2727273) *
##              103) total_acre< 0.143 262   94 1 (0.3587786 0.6412214)  
##                206) improve_issues_tallies>=43.5 87   41 1 (0.4712644 0.5287356)  
##                  412) num_vacant_parcels< 48 51   19 0 (0.6274510 0.3725490) *
##                  413) num_vacant_parcels>=48 36    9 1 (0.2500000 0.7500000) *
##                207) improve_issues_tallies< 43.5 175   53 1 (0.3028571 0.6971429) *
##         13) total_fines_by_parcel>=25 283   89 1 (0.3144876 0.6855124) *
##        7) num_vacant_parcels>=65.5 992  278 1 (0.2802419 0.7197581)  
##         14) num_vacant_parcels< 112.5 631  208 1 (0.3296355 0.6703645)  
##           28) council_di=3,4,5,6 391  154 1 (0.3938619 0.6061381)  
##             56) improve_issues_tallies>=47.5 30    6 0 (0.8000000 0.2000000) *
##             57) improve_issues_tallies< 47.5 361  130 1 (0.3601108 0.6398892)  
##              114) total_fines_by_parcel< 25 240  105 1 (0.4375000 0.5625000)  
##                228) improve_issues_tallies>=17.5 151   75 0 (0.5033113 0.4966887)  
##                  456) improve_issues_tallies< 24.5 54   19 0 (0.6481481 0.3518519)  
##                    912) num_vacant_parcels>=71.5 45   12 0 (0.7333333 0.2666667) *
##                    913) num_vacant_parcels< 71.5 9    2 1 (0.2222222 0.7777778) *
##                  457) improve_issues_tallies>=24.5 97   41 1 (0.4226804 0.5773196)  
##                    914) earlier_violent_crime_incidents< 130.5 14    3 0 (0.7857143 0.2142857) *
##                    915) earlier_violent_crime_incidents>=130.5 83   30 1 (0.3614458 0.6385542)  
##                     1830) frontage>=41 13    4 0 (0.6923077 0.3076923) *
##                     1831) frontage< 41 70   21 1 (0.3000000 0.7000000) *
##                229) improve_issues_tallies< 17.5 89   29 1 (0.3258427 0.6741573) *
##              115) total_fines_by_parcel>=25 121   25 1 (0.2066116 0.7933884) *
##           29) council_di=1,2,7 240   54 1 (0.2250000 0.7750000) *
##         15) num_vacant_parcels>=112.5 361   70 1 (0.1939058 0.8060942)  
##           30) frontage>=66.5 9    2 0 (0.7777778 0.2222222) *
##           31) frontage< 66.5 352   63 1 (0.1789773 0.8210227) *

Now try glm (logistic regression with the binomial family).

models <- 1:10 %>% map(~ glm(complete_formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))


#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7072181
sd(as.numeric(accuracies))
## [1] 0.02084383
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 136  66
##   TRUE   50 112
##        truth
## pred      0   1
##   FALSE 143  56
##   TRUE   44 122
##        truth
## pred      0   1
##   FALSE 140  54
##   TRUE   46 124
##        truth
## pred      0   1
##   FALSE 147  61
##   TRUE   39 117
##        truth
## pred      0   1
##   FALSE 129  62
##   TRUE   57 116
##        truth
## pred      0   1
##   FALSE 137  63
##   TRUE   49 115
##        truth
## pred      0   1
##   FALSE 149  63
##   TRUE   37 115
##        truth
## pred      0   1
##   FALSE 142  60
##   TRUE   44 118
##        truth
## pred      0   1
##   FALSE 140  56
##   TRUE   46 122
##        truth
## pred      0   1
##   FALSE 133  60
##   TRUE   53 118
models[[9]]
## 
## Call:  glm(formula = complete_formula, family = "binomial", data = train[-k_folds[[.x]], 
##     ])
## 
## Coefficients:
##                      (Intercept)             total_fines_by_parcel  
##                       -0.9433568                         0.0002123  
##   number_of_violations_by_parcel            improve_issues_tallies  
##                        0.1064047                         0.0014371  
## earlier_property_crime_incidents   earlier_violent_crime_incidents  
##                       -0.0071346                         0.0057590  
##                       total_acre                       council_di2  
##                        0.0645114                        -0.1501909  
##                      council_di3                       council_di4  
##                       -0.0871947                         0.0486041  
##                      council_di5                       council_di6  
##                       -0.6324548                        -0.3312649  
##                      council_di7                          frontage  
##                       -0.1536028                        -0.0037837  
##               num_vacant_parcels       num_nearby_blighted_parcels  
##                        0.0144593                         0.1672508  
##    num_violations_nearby_parcels        earlier_nuiscance_offences  
##                       -0.0057991                         0.0354739  
## 
## Degrees of Freedom: 3276 Total (i.e. Null);  3259 Residual
## Null Deviance:       4541 
## Residual Deviance: 3712  AIC: 3748

Now try support vector machines, without normalizing the variables.

library(e1071)

#separate the examples to be used for training from the twenty percent of the examples withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- complete_tally_set %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(451)
k_folds <- caret::createFolds(train$blighted)

models <- 1:10 %>% map(~ svm(complete_formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],]))

accuracies <- 1:10 %>% 
  map(~ mean(unlist(predictions[[.x]] == train[k_folds[[.x]],]$blighted)))

#summary statistics over the models
mean(unlist(accuracies))
## [1] 0.7151791
sd(unlist(accuracies))
## [1] 0.02433306
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]]), 
              truth = train[k_folds[[index]],]$blighted))
}
##     truth
## pred   0   1
##    0 126  48
##    1  60 130
##     truth
## pred   0   1
##    0 139  44
##    1  48 134
##     truth
## pred   0   1
##    0 137  43
##    1  49 135
##     truth
## pred   0   1
##    0 134  49
##    1  52 129
##     truth
## pred   0   1
##    0 117  49
##    1  69 129
##     truth
## pred   0   1
##    0 134  46
##    1  52 132
##     truth
## pred   0   1
##    0 139  54
##    1  47 124
##     truth
## pred   0   1
##    0 124  47
##    1  62 131
##     truth
## pred   0   1
##    0 130  47
##    1  56 131
##     truth
## pred   0   1
##    0 125  54
##    1  61 124
#details of one of the SVM models, shown for reference
models[[3]]
## 
## Call:
## svm(formula = complete_formula, data = train[-k_folds[[.x]], 
##     ])
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.05555556 
## 
## Number of Support Vectors:  2105

Now try support vector machines, after normalizing the numerical variables. (Note that e1071's svm() scales its inputs by default, so we would not expect the results to differ much from the unscaled run above.)

train$blighted <- as.factor(train$blighted)

#cut out the non-numeric columns, scale the numeric columns, and then put the non-numeric columns back in
train_svm <- train %>% select(-parcelnum, -blighted, -council_di) %>% scale() %>% as.tibble() %>% 
  mutate(parcelnum = train$parcelnum, blighted = train$blighted, council_di = train$council_di)

set.seed(451)
k_folds <- caret::createFolds(train_svm$blighted)

models <- 1:10 %>% map(~ svm(complete_formula, data = train_svm[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train_svm[k_folds[[.x]],]))

accuracies <- 1:10 %>% 
  map(~ mean(unlist(predictions[[.x]] == train_svm[k_folds[[.x]],]$blighted)))

#summary statistics over the models
mean(unlist(accuracies))
## [1] 0.7151791
sd(unlist(accuracies))
## [1] 0.02433306
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]]), 
              truth = train_svm[k_folds[[index]],]$blighted))
}
##     truth
## pred   0   1
##    0 126  48
##    1  60 130
##     truth
## pred   0   1
##    0 139  44
##    1  48 134
##     truth
## pred   0   1
##    0 137  43
##    1  49 135
##     truth
## pred   0   1
##    0 134  49
##    1  52 129
##     truth
## pred   0   1
##    0 117  49
##    1  69 129
##     truth
## pred   0   1
##    0 134  46
##    1  52 132
##     truth
## pred   0   1
##    0 139  54
##    1  47 124
##     truth
## pred   0   1
##    0 124  47
##    1  62 131
##     truth
## pred   0   1
##    0 130  47
##    1  56 131
##     truth
## pred   0   1
##    0 125  54
##    1  61 124
#details of one of the SVM models trained on the scaled data, shown for reference
models[[3]]
## 
## Call:
## svm(formula = complete_formula, data = train_svm[-k_folds[[.x]], 
##     ])
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.05555556 
## 
## Number of Support Vectors:  2105

Now try random forest:

library(randomForest)

models <- 1:10 %>% map(~ randomForest(complete_formula, 
                                      data = train[-k_folds[[.x]],], ntree = 2000))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7258919
sd(as.numeric(accuracies))
## [1] 0.0210122
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 126  43
##   TRUE   60 135
##        truth
## pred      0   1
##   FALSE 143  46
##   TRUE   44 132
##        truth
## pred      0   1
##   FALSE 134  42
##   TRUE   52 136
##        truth
## pred      0   1
##   FALSE 126  37
##   TRUE   60 141
##        truth
## pred      0   1
##   FALSE 118  45
##   TRUE   68 133
##        truth
## pred      0   1
##   FALSE 134  47
##   TRUE   52 131
##        truth
## pred      0   1
##   FALSE 142  45
##   TRUE   44 133
##        truth
## pred      0   1
##   FALSE 131  47
##   TRUE   55 131
##        truth
## pred      0   1
##   FALSE 127  49
##   TRUE   59 129
##        truth
## pred      0   1
##   FALSE 128  45
##   TRUE   58 133
models[[3]]
## 
## Call:
##  randomForest(formula = complete_formula, data = train[-k_folds[[.x]],      ], ntree = 2000) 
##                Type of random forest: classification
##                      Number of trees: 2000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.13%
## Confusion matrix:
##      0    1 class.error
## 0 1186  489   0.2919403
## 1  400 1202   0.2496879

Now try adaboost. The first step is to define an adaboost function, in terms of rpart.
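
For orientation, the weight update inside the function follows the usual AdaBoost recipe: with epsilon the summed weight of the misclassified examples, correctly classified examples are multiplied by beta = sqrt(epsilon/(1 - epsilon)), misclassified examples are divided by beta, the weights are renormalized, and the model's vote in the final ensemble is weighted by log((1 - epsilon)/epsilon). A minimal sketch of one such update, using toy numbers rather than our data:

#toy illustration of the weight update used inside demolition_adaboost() below
wts <- rep(1/5, 5)                             #five equally weighted training examples
correct <- c(TRUE, TRUE, FALSE, TRUE, FALSE)   #hypothetical classification results
epsilon <- sum(wts[!correct])                  #summed weight of the misclassified examples
beta <- sqrt(epsilon / (1 - epsilon))          #adjustment factor
wts <- ifelse(correct, wts * beta, wts / beta) #shrink correct weights, grow misclassified ones
wts <- wts / sum(wts)                          #renormalize the weights
model_vote <- log((1 - epsilon) / epsilon)     #this model's voting weight in the ensemble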

#returns a prediction on the "test" dataset
demolition_adaboost <- function(ada_train, ada_test, formula, iterations, tree_depth, min_cp) {
  
  num_rows <- nrow(ada_train)
  
  #initialize the weights to be applied to each example in the training set
  ada_train$wts <- 1/num_rows         
  
  models <- list() #initializing the list of rpart models
  pred_weights <- list() #initializing the list of model voting weights, one per iteration
  adjustment_factors <- list() #initializing a list of the factors used to adjust the example weights
  pred_list <- list()  #predictions on the test dataset from EACH tree (NOT the sum)
  for (index in 1:iterations) {
    
    #In this implementation of Adaboost, the training set is sampled with replacement, and then the
    #weights (as prescribed in the Adaboost meta-algorithm) are used to determine the relative importance
    #of the respective datapoints in an rpart fit on the sample. I have also experimented with
    #using the weights to determine the relative probabilities for selection in the sample. However, the
    #performance (accuracy in cross-validation) was rather disappointing, with many of the weights
    #quickly converging to zero.
    
    ada_resample <- sample_n(ada_train, num_rows, replace = TRUE)
    
    #the weights for the rpart model
    wts <- ada_resample$wts
    
    model <- rpart::rpart(formula, data = ada_resample, 
                   method = "class", weights = wts,
                   control = rpart.control(maxdepth = tree_depth,
                                           cp = min_cp))
      
    #predict over the entire training set
    prediction <- predict(model, newdata = ada_train, type = "class")
    ada_train$Prediction <- prediction
      
    #The predictions on the proper test set are calculated here:
    pred <- predict(model, newdata = ada_test, type="class")
    ada_test$Prediction <- pred
    
    ####ada_test <- arrange(ada_test, parcelnum)
    pred_list[[index]] <- as.numeric(ada_test$Prediction) - 1  #stores prediction for the test set
   
    #We now handle the calculation of the adjustment factor and its application to the example weights
   
     wrong_cases <- ada_train[ada_train$blighted != ada_train$Prediction,]
    sum_weights_misclassified <- sum(wrong_cases$wts)  #epsilon
    adjustment_factor <- 
      sqrt(sum_weights_misclassified / (1 - sum_weights_misclassified))  #beta
    correct_cases <- ada_train[ada_train$blighted == ada_train$Prediction,]
    
    #apply the adjustment factors to the weights
    ada_train <- transform(ada_train,
                      wts = ifelse(parcelnum %in% correct_cases$parcelnum,
                                        wts * adjustment_factor, wts / adjustment_factor))
    #renormalize the weights
    ada_train <- transform(ada_train,
      wts = wts/sum(wts))
    
    #save the weight on the model(s) in this iteration, the model(s), and the adjustment factor
    #in a list (for both calculating the adaboost prediction and for examining after running the
    #program)
    pred_weights[[index]] <- log((1-sum_weights_misclassified)/sum_weights_misclassified)
    models[[index]] <- model
    adjustment_factors[[index]] <- adjustment_factor
    }
  
  #Apply the weighted models to the test data, to derive our predictions
  sum_weighted_predictions <- 0 #initialize the weighted sum of the predictions 
 
  #initialize a list in which the i'th element is the adaboost prediction after i iterations
  prediction_list <- list() 
  accuracy_list <- list()
  #confusion_matrix_list <- list()
  for (index in 1:iterations) {
    
    pred <- pred_list[[index]] 
    sum_weighted_predictions <- 
      sum_weighted_predictions + (pred - 0.5)*pred_weights[[index]]
    prediction_list[[index]] <- as.numeric(sum_weighted_predictions > 0)
    accuracy_list[[index]] <- mean(as.numeric(prediction_list[[index]] == ada_test$blighted))
    }
  
  predictions_df <- tibble(parcelnum = ada_test$parcelnum, truth = ada_test$blighted)
  for (index in 1:iterations) {
    predictions_df[[index + 2]] <- prediction_list[[index]]
  }
  
  #return a comprehensive set of information
  return(list(predictions = predictions_df, 
              accuracies = accuracy_list, 
              rpart_models = models, 
              prediction_weights = pred_weights,
              example_weights = ada_train$wts,
              adjustment_factors = adjustment_factors,
              predicted_parcels = ada_test$parcelnum))
  }
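
For reference, a minimal single-fold usage sketch of this function (not run here; it assumes the train, k_folds, and complete_formula objects defined above, and the argument values are illustrative):

#apply the adaboost function to the first fold, with a modest number of iterations
#single_fold <- demolition_adaboost(train[-k_folds[[1]],], train[k_folds[[1]],],
#                                   complete_formula, iterations = 100,
#                                   tree_depth = 1, min_cp = 0.006)
#single_fold$accuracies[[100]]         #accuracy of the 100-iteration ensemble on the held-out fold
#single_fold$prediction_weights[[1]]   #voting weight of the first rpart stump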

We now apply the function for training and k-fold cross validation, using a parallel loop for the cross-validation. My various runs of this approach on the training data have suggested that the adaboost model is most predictive at between 800 and 1200 boosting iterations. However, for any given number of iterations, the accuracy of the models varies. I thus used the average of the predictions within the range from 800 to 1200, rounded to 0 or 1.

ada_iterations <- 3000
tree_depth <- 1
minimum_cp <- 0.006

library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(foreach)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
model_info_list <- 
  foreach(index = 1:10,  
          .packages = c("tidyverse", "rpart")) %dopar% { 
            
  cross_train <- train[-k_folds[[index]],]
  cross_test <- train[k_folds[[index]],]
  
  #the information to be added to model_info_list, at this iteration of parallel loop:
  
  prediction_and_info <- 
    demolition_adaboost(cross_train, cross_test, complete_formula, ada_iterations, 
                        tree_depth, minimum_cp)
  }
stopCluster(cl)
rm(cl)

combined_predictions_df <- model_info_list[[1]]$predictions
for (index in 2:10) {
  combined_predictions_df <- rbind(combined_predictions_df, 
                                   model_info_list[[index]]$predictions)
}

predictions <- combined_predictions_df %>% select(800:1200) %>%
  rowMeans()

predictions <- round(predictions)

mean(as.numeric(predictions == combined_predictions_df$truth))
## [1] 0.7234276
rm(model_info_list)
gc()
##             used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells   2565264 137.0   33889254 1809.9         NA  24068872 1285.5
## Vcells 108529827 828.1  337298010 2573.4      16384 396857188 3027.8

Consider the Pearson correlation coefficients (point biserials):
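
(A point-biserial correlation is simply the Pearson correlation computed with a dichotomous 0/1 variable; for instance, with toy vectors rather than our data:)

#Pearson correlation between a 0/1 indicator and a continuous variable, i.e. a point biserial
cor(c(0, 0, 1, 1, 1), c(10, 20, 30, 25, 40))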

correlations <- complete_tally_set %>% 
  select(-parcelnum) %>%
  mutate(council_di = as.numeric(council_di)) %>%
  cor %>%
  as.data.frame() %>%
  select(blighted)

correlations
##                                      blighted
## later_recorded_crime_incidents   -0.095603291
## blighted                          1.000000000
## number_of_violations_by_parcel    0.115515081
## total_fines_by_parcel             0.099702015
## improve_issues_tallies           -0.003417704
## earlier_property_crime_incidents -0.131612768
## earlier_violent_crime_incidents   0.020889176
## earlier_nuiscance_offences        0.034691110
## total_acre                       -0.031045148
## council_di                        0.033230814
## frontage                         -0.075408866
## num_vacant_parcels                0.358722239
## num_nearby_blighted_parcels       0.369731749
## num_violations_nearby_parcels    -0.038749707
## sum_fines_nearby_parcels         -0.025778042

Cut out the two variables with the weakest correlation:

#the complete formula, again
formula <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels + num_nearby_blighted_parcels + num_violations_nearby_parcels + sum_fines_nearby_parcels +
earlier_nuiscance_offences

#the formula with some weak variables removed
formula <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + improve_issues_tallies + earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels + num_nearby_blighted_parcels + num_violations_nearby_parcels + earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],], 
                               control = rpart.control(cp = 0.005)))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7003447
sd(as.numeric(accuracies))
## [1] 0.02386516
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 136  68
##   TRUE   50 110
##        truth
## pred      0   1
##   FALSE 141  47
##   TRUE   46 131
##        truth
## pred      0   1
##   FALSE 128  43
##   TRUE   58 135
##        truth
## pred      0   1
##   FALSE 142  61
##   TRUE   44 117
##        truth
## pred      0   1
##   FALSE 122  56
##   TRUE   64 122
##        truth
## pred      0   1
##   FALSE 135  57
##   TRUE   51 121
##        truth
## pred      0   1
##   FALSE 129  49
##   TRUE   57 129
##        truth
## pred      0   1
##   FALSE 130  54
##   TRUE   56 124
##        truth
## pred      0   1
##   FALSE 128  62
##   TRUE   58 116
##        truth
## pred      0   1
##   FALSE 131  55
##   TRUE   55 123
models[[2]]
## n= 3276 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3276 1602 0 (0.5109890 0.4890110)  
##    2) num_nearby_blighted_parcels< 1.5 1540  462 0 (0.7000000 0.3000000)  
##      4) num_vacant_parcels< 50.5 1289  309 0 (0.7602793 0.2397207) *
##      5) num_vacant_parcels>=50.5 251   98 1 (0.3904382 0.6095618) *
##    3) num_nearby_blighted_parcels>=1.5 1736  596 1 (0.3433180 0.6566820)  
##      6) num_nearby_blighted_parcels< 4.5 891  384 1 (0.4309764 0.5690236)  
##       12) num_vacant_parcels< 92.5 700  334 1 (0.4771429 0.5228571)  
##         24) number_of_violations_by_parcel< 0.5 457  208 0 (0.5448578 0.4551422)  
##           48) earlier_property_crime_incidents>=167.5 234   89 0 (0.6196581 0.3803419) *
##           49) earlier_property_crime_incidents< 167.5 223  104 1 (0.4663677 0.5336323)  
##             98) council_di=3,4,5,6,7 131   57 0 (0.5648855 0.4351145) *
##             99) council_di=1,2 92   30 1 (0.3260870 0.6739130) *
##         25) number_of_violations_by_parcel>=0.5 243   85 1 (0.3497942 0.6502058) *
##       13) num_vacant_parcels>=92.5 191   50 1 (0.2617801 0.7382199) *
##      7) num_nearby_blighted_parcels>=4.5 845  212 1 (0.2508876 0.7491124) *
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))


#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7072181
sd(as.numeric(accuracies))
## [1] 0.02084383
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 136  66
##   TRUE   50 112
##        truth
## pred      0   1
##   FALSE 143  56
##   TRUE   44 122
##        truth
## pred      0   1
##   FALSE 140  54
##   TRUE   46 124
##        truth
## pred      0   1
##   FALSE 147  61
##   TRUE   39 117
##        truth
## pred      0   1
##   FALSE 129  62
##   TRUE   57 116
##        truth
## pred      0   1
##   FALSE 137  63
##   TRUE   49 115
##        truth
## pred      0   1
##   FALSE 149  63
##   TRUE   37 115
##        truth
## pred      0   1
##   FALSE 142  60
##   TRUE   44 118
##        truth
## pred      0   1
##   FALSE 140  56
##   TRUE   46 122
##        truth
## pred      0   1
##   FALSE 133  60
##   TRUE   53 118
models[[2]]
## 
## Call:  glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]], 
##     ])
## 
## Coefficients:
##                      (Intercept)    number_of_violations_by_parcel  
##                       -0.8234977                         0.0922817  
##            total_fines_by_parcel            improve_issues_tallies  
##                        0.0003437                         0.0015887  
## earlier_property_crime_incidents   earlier_violent_crime_incidents  
##                       -0.0067456                         0.0052657  
##                       total_acre                       council_di2  
##                        0.0964307                        -0.3267911  
##                      council_di3                       council_di4  
##                       -0.1590868                        -0.0211589  
##                      council_di5                       council_di6  
##                       -0.6832887                        -0.3840206  
##                      council_di7                          frontage  
##                       -0.2411291                        -0.0050790  
##               num_vacant_parcels       num_nearby_blighted_parcels  
##                        0.0151475                         0.1535612  
##    num_violations_nearby_parcels        earlier_nuiscance_offences  
##                       -0.0057401                         0.0344131  
## 
## Degrees of Freedom: 3275 Total (i.e. Null);  3258 Residual
## Null Deviance:       4540 
## Residual Deviance: 3711  AIC: 3747

Partition the dataset according to whether num_nearby_blighted_parcels < 1.5 and consider the Pearson correlation coefficients (point biserials) within each element of the partition:

low_num_blighted_parcels <- train %>% filter(num_nearby_blighted_parcels < 1.5)

correlations <- low_num_blighted_parcels %>% 
  select(-parcelnum) %>%
  mutate(council_di = as.numeric(council_di), 
         blighted = as.numeric(blighted)) %>%
  cor %>%
  as.data.frame() %>%
  select(blighted)
correlations
##                                      blighted
## later_recorded_crime_incidents   -0.036864208
## blighted                          1.000000000
## number_of_violations_by_parcel    0.171522621
## total_fines_by_parcel             0.167231089
## improve_issues_tallies            0.037581693
## earlier_property_crime_incidents -0.064252048
## earlier_violent_crime_incidents   0.045662418
## earlier_nuiscance_offences        0.059415806
## total_acre                       -0.002345569
## council_di                        0.119472871
## frontage                         -0.035256168
## num_vacant_parcels                0.308840974
## num_nearby_blighted_parcels       0.195161605
## num_violations_nearby_parcels    -0.018366512
## sum_fines_nearby_parcels         -0.011035624
higher_num_blighted_parcels <- train %>% filter(num_nearby_blighted_parcels >= 1.5)

correlations <- higher_num_blighted_parcels %>% 
  select(-parcelnum) %>%
  mutate(council_di = as.numeric(council_di),
         blighted = as.numeric(blighted)) %>%
  cor %>%
  as.data.frame() %>%
  select(blighted)
correlations
##                                      blighted
## later_recorded_crime_incidents   -0.084367085
## blighted                          1.000000000
## number_of_violations_by_parcel    0.098843507
## total_fines_by_parcel             0.065530148
## improve_issues_tallies           -0.041507581
## earlier_property_crime_incidents -0.109006752
## earlier_violent_crime_incidents  -0.069340524
## earlier_nuiscance_offences       -0.002753084
## total_acre                       -0.009036265
## council_di                       -0.083504347
## frontage                         -0.050926429
## num_vacant_parcels                0.183269241
## num_nearby_blighted_parcels       0.199218462
## num_violations_nearby_parcels    -0.070594363
## sum_fines_nearby_parcels         -0.063022911

We try partitioning the dataset according to one of the more predictive variables, then running random forest on each of the two subsets.

formula <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels + num_nearby_blighted_parcels + num_violations_nearby_parcels + sum_fines_nearby_parcels +
earlier_nuiscance_offences


#k-fold models for parcels with a low number of nearby blighted parcels
models_1 <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],] %>%
                                        filter(num_nearby_blighted_parcels < 1.5)))

#k-fold models for parcels with a high number of nearby blighted parcels
models_2 <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],] %>%
                                        filter(num_nearby_blighted_parcels >= 1.5)))

predictions_1 <- 1:10 %>% map(~ predict(models_1[[.x]], newdata = train[k_folds[[.x]],] %>%
                                          filter(num_nearby_blighted_parcels < 1.5), 
                                        type = "class"))

predictions_1 <- 1:10 %>% map(~as.numeric(predictions_1[[.x]]) - 1)

predictions_2 <- 1:10 %>% map(~ predict(models_2[[.x]], newdata = train[k_folds[[.x]],] %>%
                                          filter(num_nearby_blighted_parcels >= 1.5), 
                                        type = "class"))

predictions_2 <- 1:10 %>% map(~as.numeric(predictions_2[[.x]]) - 1)

accuracies_1 <- 1:10 %>% 
  map(~ as.numeric(predictions_1[[.x]] == (train[k_folds[[.x]],] %>%
                                          filter(num_nearby_blighted_parcels < 1.5))$blighted))

accuracies_2 <- 1:10 %>% 
  map(~ as.numeric(as.numeric(! predictions_2[[.x]] < 0.5) == (train[k_folds[[.x]],] %>%
                                          filter(num_nearby_blighted_parcels >= 1.5))$blighted))

#summary statistics over the models
mean(unlist(accuracies_1))
## [1] 0.7443609
mean(unlist(accuracies_2))
## [1] 0.6940377
#grand mean
mean(c(unlist(accuracies_1), unlist(accuracies_2)))
## [1] 0.7179346

Consider the following decision tree, which forms the rpart model when the complexity parameter is set relatively high.

  1. root 3276 1602 0 (0.5109890 0.4890110)
  2. num_nearby_blighted_parcels< 1.5 1540 462 0 (0.7000000 0.3000000)
    1. num_vacant_parcels< 50.5 1289 309 0 (0.7602793 0.2397207) *
    2. num_vacant_parcels>=50.5 251 98 1 (0.3904382 0.6095618) *
  3. num_nearby_blighted_parcels>=1.5 1736 596 1 (0.3433180 0.6566820) *

We partition our data according to the end nodes in our decision tree, and then train a random forest model on each of the three subsets specified by the end nodes.
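
The filter conditions for the three subsets are stored as unevaluated expressions with rlang::expr() and spliced into dplyr::filter() with the !! operator, so each condition can be reused for both the training and test folds. A minimal sketch of the pattern (the object names here are illustrative):

#store a condition without evaluating it...
cond <- rlang::expr(num_vacant_parcels < 50.5)
#...and splice it into filter() later; equivalent to filter(train, num_vacant_parcels < 50.5)
low_vacancy <- train %>% filter(!!cond)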

library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, %||%, as_function, flatten, flatten_chr, flatten_dbl,
##     flatten_int, flatten_lgl, invoke, list_along, modify, prepend,
##     rep_along, splice
condition_1 <- expr(num_nearby_blighted_parcels < 1.5 & num_vacant_parcels < 50.5)

condition_2 <- expr(num_nearby_blighted_parcels < 1.5 & num_vacant_parcels >= 50.5)

condition_3 <- expr(num_nearby_blighted_parcels >= 1.5)

cat("Case 1\n")
## Case 1
condition_1
## num_nearby_blighted_parcels < 1.5 & num_vacant_parcels < 50.5
correlations <- train %>% filter(!!condition_1) %>% 
  select(-parcelnum) %>%
  mutate(council_di = as.numeric(council_di), 
         blighted = as.numeric(blighted)) %>%
  cor %>%
  as.data.frame() %>%
  select(blighted)
correlations
##                                     blighted
## later_recorded_crime_incidents    0.08601412
## blighted                          1.00000000
## number_of_violations_by_parcel    0.18193737
## total_fines_by_parcel             0.19494348
## improve_issues_tallies            0.10234646
## earlier_property_crime_incidents  0.05643093
## earlier_violent_crime_incidents   0.14697412
## earlier_nuiscance_offences        0.07716656
## total_acre                       -0.01457370
## council_di                        0.05569572
## frontage                         -0.01025897
## num_vacant_parcels                0.11002305
## num_nearby_blighted_parcels       0.15887367
## num_violations_nearby_parcels     0.04751772
## sum_fines_nearby_parcels          0.04005951
cat("\nCase 2\n")
## 
## Case 2
condition_2
## num_nearby_blighted_parcels < 1.5 & num_vacant_parcels >= 50.5
correlations <- train %>% filter(!!condition_2) %>% 
  select(-parcelnum) %>%
  mutate(council_di = as.numeric(council_di), 
         blighted = as.numeric(blighted)) %>%
  cor %>%
  as.data.frame() %>%
  select(blighted)
correlations
##                                     blighted
## later_recorded_crime_incidents   -0.13442428
## blighted                          1.00000000
## number_of_violations_by_parcel    0.09730020
## total_fines_by_parcel             0.09436696
## improve_issues_tallies           -0.10583204
## earlier_property_crime_incidents -0.15685899
## earlier_violent_crime_incidents  -0.13620956
## earlier_nuiscance_offences       -0.03463447
## total_acre                       -0.03736132
## council_di                       -0.04086219
## frontage                         -0.09196425
## num_vacant_parcels                0.17236660
## num_nearby_blighted_parcels       0.06368801
## num_violations_nearby_parcels    -0.14542858
## sum_fines_nearby_parcels         -0.10425741
cat("\nCase 3\n")
## 
## Case 3
condition_3
## num_nearby_blighted_parcels >= 1.5
correlations <- train %>% filter(!!condition_3) %>% 
  select(-parcelnum) %>%
  mutate(council_di = as.numeric(council_di), 
         blighted = as.numeric(blighted)) %>%
  cor %>%
  as.data.frame() %>%
  select(blighted)
correlations
##                                      blighted
## later_recorded_crime_incidents   -0.084367085
## blighted                          1.000000000
## number_of_violations_by_parcel    0.098843507
## total_fines_by_parcel             0.065530148
## improve_issues_tallies           -0.041507581
## earlier_property_crime_incidents -0.109006752
## earlier_violent_crime_incidents  -0.069340524
## earlier_nuiscance_offences       -0.002753084
## total_acre                       -0.009036265
## council_di                       -0.083504347
## frontage                         -0.050926429
## num_vacant_parcels                0.183269241
## num_nearby_blighted_parcels       0.199218462
## num_violations_nearby_parcels    -0.070594363
## sum_fines_nearby_parcels         -0.063022911
formula_1 <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + improve_issues_tallies +
  earlier_property_crime_incidents + earlier_violent_crime_incidents + council_di +
  num_violations_nearby_parcels + earlier_nuiscance_offences

formula_2 <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + council_di + frontage + num_vacant_parcels +
  num_nearby_blighted_parcels + num_violations_nearby_parcels

formula_3 <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + improve_issues_tallies +
  earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage +
  num_vacant_parcels + num_violations_nearby_parcels + sum_fines_nearby_parcels

#formula_1 <- formula
#formula_2 <- formula
#formula_3 <- formula

models_1 <- 1:10 %>% map(~ randomForest(formula_1, data = train[-k_folds[[.x]],] %>%
                                        filter(!!condition_1), ntree = 2000))

models_2 <- 1:10 %>% map(~ randomForest(formula_2, data = train[-k_folds[[.x]],] %>%
                                        filter(!!condition_2), ntree = 2000))

models_3 <- 1:10 %>% map(~ randomForest(formula_3, data = train[-k_folds[[.x]],] %>%
                                        filter(!!condition_3), ntree = 2000))

predictions_1 <- 1:10 %>% map(~ predict(models_1[[.x]], newdata = train[k_folds[[.x]],] %>%
                                          filter(!!condition_1), 
                                        type = "class"))

predictions_2 <- 1:10 %>% map(~ predict(models_2[[.x]], newdata = train[k_folds[[.x]],] %>%
                                          filter(!!condition_2), 
                                        type = "class"))

predictions_3 <- 1:10 %>% map(~ predict(models_3[[.x]], newdata = train[k_folds[[.x]],] %>%
                                          filter(!!condition_3), 
                                        type = "class"))

truth_1 <- 1:10 %>% map(~ (train[k_folds[[.x]],] %>% filter(!!condition_1))$blighted)

truth_2 <- 1:10 %>% map(~ (train[k_folds[[.x]],] %>% filter(!!condition_2))$blighted)

truth_3 <- 1:10 %>% map(~ (train[k_folds[[.x]],] %>% filter(!!condition_3))$blighted)

accuracies_1 <- 1:10 %>% map(~ (as.numeric(predictions_1[[.x]] == truth_1[[.x]])))

accuracies_2 <- 1:10 %>% map(~ (as.numeric(predictions_2[[.x]] == truth_2[[.x]])))

accuracies_3 <- 1:10 %>% map(~ (as.numeric(predictions_3[[.x]] == truth_3[[.x]])))

mean(unlist(accuracies_1))
## [1] 0.76434
mean(unlist(accuracies_2))
## [1] 0.6843972
mean(unlist(accuracies_3))
## [1] 0.6830544
#grand mean
mean(c(unlist(accuracies_1), unlist(accuracies_2), unlist(accuracies_3)))
## [1] 0.7154628
cat("\n")
#grand confusion matrix
table(truth = c(unlist(truth_1), unlist(truth_2), unlist(truth_3)) - 1,
      pred = c(unlist(predictions_1), unlist(predictions_2), unlist(predictions_3)) - 1)
##      pred
## truth    0    1
##     0 1285  576
##     1  460 1320

Having trained a model on the complete training set and then tested it on the 15% of the data that was withheld for final testing, I achieved an accuracy of 75%. As I was somewhat skeptical as to whether this figure would be typical, I tried k-fold cross testing over the entire balanced dataset.

#ensure that the outcome variable is a factor
complete_tally_set$blighted <- as.factor(complete_tally_set$blighted)

#set.seed(451)
full_k_folds <- caret::createFolds(complete_tally_set$blighted)
#(with the default of 10 folds)

ada_iterations <- 2000
tree_depth <- 1
minimum_cp <- 0.006

library(doParallel)
library(foreach)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
model_info_list <- 
  foreach(index = 1:10,  
          .packages = c("tidyverse", "rpart")) %dopar% { 
            
  cross_train <- complete_tally_set[-full_k_folds[[index]],]
  cross_test <- complete_tally_set[full_k_folds[[index]],]
  
  #the information to be added to model_info_list, at this iteration of parallel loop:
  
  prediction_and_info <- 
    demolition_adaboost(cross_train, cross_test, complete_formula, ada_iterations, 
                        tree_depth, minimum_cp)
  }
stopCluster(cl)
rm(cl)

#max(accuracy_averages)
#which.max(accuracy_averages)

#names(accuracy_averages) <- 1:length(accuracy_averages)

#3000
#accuracy_averages[1:1000]
#389-390
#486-495
#772-793

#max(accuracy_averages)
#.72613

#rm(model_info_list)
#gc()

combined_predictions_df <- model_info_list[[1]]$predictions
for (index in 2:10) {
  combined_predictions_df <- rbind(combined_predictions_df, 
                                   model_info_list[[index]]$predictions)
}

#for (index in 3:ncol(combined_predictions_df)) {
#  print(c(index - 2, mean(combined_predictions_df$truth == combined_predictions_df[[index]])))
#}

predictions <- combined_predictions_df %>% select(900:1100) %>%
  rowMeans()

predictions <- round(predictions)

mean(as.numeric(predictions == combined_predictions_df$truth))
## [1] 0.7226243

Remove the information returned by the parallel loop, which includes each of the rpart models that were generated as part of adaboost.

rm(model_info_list)
gc()
##             used  (Mb) gc trigger (Mb) limit (Mb)  max used   (Mb)
## Ncells   2578914 137.8   27111403 1448         NA  24068872 1285.5
## Vcells 113567746 866.5  323870089 2471      16384 396857188 3027.8

Now look at random forest over the complete balanced set.

#ensure that the outcome variable is a factor
complete_tally_set$blighted <- as.factor(complete_tally_set$blighted)

#set.seed(555)
full_k_folds <- caret::createFolds(complete_tally_set$blighted)
#(with the default of 10 folds)

library(randomForest)

library(doParallel)
library(foreach)
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
randomForest_models <- 
  foreach(index = 1:10,  
          .packages = c("tidyverse", "randomForest")) %dopar% { 

  randomForest(formula, 
              complete_tally_set[-full_k_folds[[index]],], 
              ntree = 1000)  
            
  }
stopCluster(cl)
rm(cl)            
      
predictions <- 1:10 %>% map(~ predict(randomForest_models[[.x]], 
                                      newdata = complete_tally_set[full_k_folds[[.x]],], 
                                      type = "class"))

#predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(predictions[[.x]] == complete_tally_set[full_k_folds[[.x]],]$blighted))

#mean accuracy for each fold
accuracies
## [[1]]
## [1] 0.7242991
## 
## [[2]]
## [1] 0.7266355
## 
## [[3]]
## [1] 0.7296037
## 
## [[4]]
## [1] 0.7056075
## 
## [[5]]
## [1] 0.7149533
## 
## [[6]]
## [1] 0.7272727
## 
## [[7]]
## [1] 0.7079439
## 
## [[8]]
## [1] 0.7365967
## 
## [[9]]
## [1] 0.7266355
## 
## [[10]]
## [1] 0.7102804
#mean accuracy and standard deviation over the folds
mean(unlist(accuracies))
## [1] 0.7209828
sd(unlist(accuracies))
## [1] 0.01048752
#confusion matrices for the models
for(index in 1:10) {
print(table(pred = predictions[[index]], 
      truth = complete_tally_set[full_k_folds[[index]],]$blighted))
}
##     truth
## pred   0   1
##    0 163  61
##    1  57 147
##     truth
## pred   0   1
##    0 155  52
##    1  65 156
##     truth
## pred   0   1
##    0 158  54
##    1  62 155
##     truth
## pred   0   1
##    0 144  50
##    1  76 158
##     truth
## pred   0   1
##    0 155  57
##    1  65 151
##     truth
## pred   0   1
##    0 154  51
##    1  66 158
##     truth
## pred   0   1
##    0 145  50
##    1  75 158
##     truth
## pred   0   1
##    0 158  51
##    1  62 158
##     truth
## pred   0   1
##    0 168  65
##    1  52 143
##     truth
## pred   0   1
##    0 158  62
##    1  62 146
randomForest_models[[4]]
## 
## Call:
##  randomForest(formula = formula, data = complete_tally_set[-full_k_folds[[index]],      ], ntree = 1000) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.29%
## Confusion matrix:
##      0    1 class.error
## 0 1400  580   0.2929293
## 1  472 1403   0.2517333