library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1.9000 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::vars() masks ggplot2::vars()
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
Data and other information from Detroit_Draft_3.Rmd:
# Load the per-parcel tallies computed in Detroit_Draft_3.Rmd.
complete_tally_set <- read_rds("calculated_tallies.rds")
# Full model formula: predict the blighted label from parcel-level fine/violation
# tallies, crime counts, lot geometry, council district, and neighborhood blight.
# (Note: "nuiscance" is a misspelling baked into the dataset's column name.)
complete_formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
num_nearby_blighted_parcels + num_violations_nearby_parcels + earlier_nuiscance_offences
# Parcel numbers reserved for training (the remaining 20% is the held-out test set).
training_parcelnums <- read_rds("training_parcelnums.rds")
We partition the dataset for k-fold cross-validation and train rpart models on the respective portions.
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- complete_tally_set %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
# rpart's "class" method requires a factor response.
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(451)
k_folds <- caret::createFolds(train$blighted)
# Fit one classification tree per fold, training on the nine folds outside the
# held-out fold; cp = 0.003 controls how aggressively the tree is pruned.
models <- 1:10 %>% map(~ rpart(complete_formula, data = train[-k_folds[[.x]],],
method = "class", control = rpart.control(cp = 0.003)))
# Predict each model's own held-out fold ...
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
# ... then convert the factor predictions (levels "0"/"1") to numeric 0/1.
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
# Per-fold accuracy: fraction of held-out examples whose thresholded prediction
# matches the label. (Comparing numeric 0/1 to the factor works here because the
# factor levels are the strings "0" and "1".)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7105073
sd(as.numeric(accuracies))
## [1] 0.02807713
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 119 42
## TRUE 67 136
## truth
## pred 0 1
## FALSE 139 42
## TRUE 48 136
## truth
## pred 0 1
## FALSE 133 41
## TRUE 53 137
## truth
## pred 0 1
## FALSE 139 54
## TRUE 47 124
## truth
## pred 0 1
## FALSE 111 48
## TRUE 75 130
## truth
## pred 0 1
## FALSE 126 46
## TRUE 60 132
## truth
## pred 0 1
## FALSE 147 59
## TRUE 39 119
## truth
## pred 0 1
## FALSE 128 46
## TRUE 58 132
## truth
## pred 0 1
## FALSE 125 53
## TRUE 61 125
## truth
## pred 0 1
## FALSE 120 49
## TRUE 66 129
models[[3]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) num_nearby_blighted_parcels< 1.5 1545 460 0 (0.7022654 0.2977346)
## 4) num_vacant_parcels< 50.5 1291 303 0 (0.7652982 0.2347018)
## 8) total_fines_by_parcel< 25 918 146 0 (0.8409586 0.1590414) *
## 9) total_fines_by_parcel>=25 373 157 0 (0.5790885 0.4209115)
## 18) total_fines_by_parcel< 512.5 199 67 0 (0.6633166 0.3366834) *
## 19) total_fines_by_parcel>=512.5 174 84 1 (0.4827586 0.5172414)
## 38) number_of_violations_by_parcel>=2.5 133 60 0 (0.5488722 0.4511278)
## 76) total_fines_by_parcel< 3025 107 41 0 (0.6168224 0.3831776)
## 152) earlier_violent_crime_incidents< 399.5 100 35 0 (0.6500000 0.3500000) *
## 153) earlier_violent_crime_incidents>=399.5 7 1 1 (0.1428571 0.8571429) *
## 77) total_fines_by_parcel>=3025 26 7 1 (0.2692308 0.7307692) *
## 39) number_of_violations_by_parcel< 2.5 41 11 1 (0.2682927 0.7317073) *
## 5) num_vacant_parcels>=50.5 254 97 1 (0.3818898 0.6181102)
## 10) earlier_property_crime_incidents>=108.5 151 74 1 (0.4900662 0.5099338)
## 20) total_fines_by_parcel< 75 92 39 0 (0.5760870 0.4239130) *
## 21) total_fines_by_parcel>=75 59 21 1 (0.3559322 0.6440678)
## 42) earlier_violent_crime_incidents< 137 7 1 0 (0.8571429 0.1428571) *
## 43) earlier_violent_crime_incidents>=137 52 15 1 (0.2884615 0.7115385) *
## 11) earlier_property_crime_incidents< 108.5 103 23 1 (0.2233010 0.7766990) *
## 3) num_nearby_blighted_parcels>=1.5 1732 590 1 (0.3406467 0.6593533)
## 6) num_nearby_blighted_parcels< 4.5 886 384 1 (0.4334086 0.5665914)
## 12) num_vacant_parcels< 92.5 699 332 1 (0.4749642 0.5250358)
## 24) total_fines_by_parcel< 25 466 215 0 (0.5386266 0.4613734)
## 48) council_di=3,6,7 203 78 0 (0.6157635 0.3842365)
## 96) earlier_nuiscance_offences< 10.5 196 72 0 (0.6326531 0.3673469) *
## 97) earlier_nuiscance_offences>=10.5 7 1 1 (0.1428571 0.8571429) *
## 49) council_di=1,2,4,5 263 126 1 (0.4790875 0.5209125)
## 98) num_violations_nearby_parcels>=49.5 40 11 0 (0.7250000 0.2750000) *
## 99) num_violations_nearby_parcels< 49.5 223 97 1 (0.4349776 0.5650224)
## 198) total_acre>=0.14 35 12 0 (0.6571429 0.3428571) *
## 199) total_acre< 0.14 188 74 1 (0.3936170 0.6063830)
## 398) num_vacant_parcels>=89.5 9 2 0 (0.7777778 0.2222222) *
## 399) num_vacant_parcels< 89.5 179 67 1 (0.3743017 0.6256983)
## 798) earlier_property_crime_incidents>=166.5 67 33 0 (0.5074627 0.4925373)
## 1596) earlier_violent_crime_incidents< 239.5 32 10 0 (0.6875000 0.3125000) *
## 1597) earlier_violent_crime_incidents>=239.5 35 12 1 (0.3428571 0.6571429) *
## 799) earlier_property_crime_incidents< 166.5 112 33 1 (0.2946429 0.7053571) *
## 25) total_fines_by_parcel>=25 233 81 1 (0.3476395 0.6523605) *
## 13) num_vacant_parcels>=92.5 187 52 1 (0.2780749 0.7219251) *
## 7) num_nearby_blighted_parcels>=4.5 846 206 1 (0.2434988 0.7565012) *
Although I believe it is useful to see a quantitative relationship between blight in one building and blight in nearby buildings, some may object to the use of blight to predict blight. (To be clear, the count of nearby blighted parcels does not include the parcel for which we are making predictions.) As can be seen below, removing this variable results in a model with an accuracy roughly 1.5 percentage points lower.
# Same formula as complete_formula but with num_nearby_blighted_parcels removed,
# to test how much predictive power that single variable contributes.
complete_formula_minus_nearby_blight <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
num_violations_nearby_parcels + earlier_nuiscance_offences
# Re-run the same 10-fold rpart procedure as above, reusing the same folds.
models <- 1:10 %>% map(~ rpart(complete_formula_minus_nearby_blight, data = train[-k_folds[[.x]],],
method = "class", control = rpart.control(cp = 0.003)))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
# Convert factor predictions (levels "0"/"1") to numeric 0/1.
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
# Per-fold accuracy against the held-out fold's labels.
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6959597
sd(as.numeric(accuracies))
## [1] 0.02561397
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 129 51
## TRUE 57 127
## truth
## pred 0 1
## FALSE 129 49
## TRUE 58 129
## truth
## pred 0 1
## FALSE 138 47
## TRUE 48 131
## truth
## pred 0 1
## FALSE 126 54
## TRUE 60 124
## truth
## pred 0 1
## FALSE 116 58
## TRUE 70 120
## truth
## pred 0 1
## FALSE 120 45
## TRUE 66 133
## truth
## pred 0 1
## FALSE 127 53
## TRUE 59 125
## truth
## pred 0 1
## FALSE 124 45
## TRUE 62 133
## truth
## pred 0 1
## FALSE 132 49
## TRUE 54 129
## truth
## pred 0 1
## FALSE 112 48
## TRUE 74 130
models[[3]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) num_vacant_parcels< 35.5 1484 439 0 (0.7041779 0.2958221)
## 4) total_fines_by_parcel< 35 1034 220 0 (0.7872340 0.2127660) *
## 5) total_fines_by_parcel>=35 450 219 0 (0.5133333 0.4866667)
## 10) total_fines_by_parcel< 512.5 236 96 0 (0.5932203 0.4067797) *
## 11) total_fines_by_parcel>=512.5 214 91 1 (0.4252336 0.5747664)
## 22) number_of_violations_by_parcel>=2.5 154 75 1 (0.4870130 0.5129870)
## 44) total_fines_by_parcel< 3025 124 57 0 (0.5403226 0.4596774)
## 88) earlier_violent_crime_incidents< 399.5 115 49 0 (0.5739130 0.4260870)
## 176) earlier_property_crime_incidents>=246.5 20 2 0 (0.9000000 0.1000000) *
## 177) earlier_property_crime_incidents< 246.5 95 47 0 (0.5052632 0.4947368)
## 354) num_violations_nearby_parcels< 19.5 21 5 0 (0.7619048 0.2380952) *
## 355) num_violations_nearby_parcels>=19.5 74 32 1 (0.4324324 0.5675676) *
## 89) earlier_violent_crime_incidents>=399.5 9 1 1 (0.1111111 0.8888889) *
## 45) total_fines_by_parcel>=3025 30 8 1 (0.2666667 0.7333333) *
## 23) number_of_violations_by_parcel< 2.5 60 16 1 (0.2666667 0.7333333) *
## 3) num_vacant_parcels>=35.5 1793 630 1 (0.3513664 0.6486336)
## 6) num_vacant_parcels< 65.5 801 352 1 (0.4394507 0.5605493)
## 12) total_fines_by_parcel< 25 518 255 0 (0.5077220 0.4922780)
## 24) council_di=4,5,6 205 72 0 (0.6487805 0.3512195)
## 48) total_acre>=0.0655 191 61 0 (0.6806283 0.3193717) *
## 49) total_acre< 0.0655 14 3 1 (0.2142857 0.7857143) *
## 25) council_di=1,2,3,7 313 130 1 (0.4153355 0.5846645)
## 50) total_acre< 0.0825 29 9 0 (0.6896552 0.3103448) *
## 51) total_acre>=0.0825 284 110 1 (0.3873239 0.6126761)
## 102) total_acre>=0.143 22 6 0 (0.7272727 0.2727273) *
## 103) total_acre< 0.143 262 94 1 (0.3587786 0.6412214)
## 206) improve_issues_tallies>=43.5 87 41 1 (0.4712644 0.5287356)
## 412) num_vacant_parcels< 48 51 19 0 (0.6274510 0.3725490) *
## 413) num_vacant_parcels>=48 36 9 1 (0.2500000 0.7500000) *
## 207) improve_issues_tallies< 43.5 175 53 1 (0.3028571 0.6971429) *
## 13) total_fines_by_parcel>=25 283 89 1 (0.3144876 0.6855124) *
## 7) num_vacant_parcels>=65.5 992 278 1 (0.2802419 0.7197581)
## 14) num_vacant_parcels< 112.5 631 208 1 (0.3296355 0.6703645)
## 28) council_di=3,4,5,6 391 154 1 (0.3938619 0.6061381)
## 56) improve_issues_tallies>=47.5 30 6 0 (0.8000000 0.2000000) *
## 57) improve_issues_tallies< 47.5 361 130 1 (0.3601108 0.6398892)
## 114) total_fines_by_parcel< 25 240 105 1 (0.4375000 0.5625000)
## 228) improve_issues_tallies>=17.5 151 75 0 (0.5033113 0.4966887)
## 456) improve_issues_tallies< 24.5 54 19 0 (0.6481481 0.3518519)
## 912) num_vacant_parcels>=71.5 45 12 0 (0.7333333 0.2666667) *
## 913) num_vacant_parcels< 71.5 9 2 1 (0.2222222 0.7777778) *
## 457) improve_issues_tallies>=24.5 97 41 1 (0.4226804 0.5773196)
## 914) earlier_violent_crime_incidents< 130.5 14 3 0 (0.7857143 0.2142857) *
## 915) earlier_violent_crime_incidents>=130.5 83 30 1 (0.3614458 0.6385542)
## 1830) frontage>=41 13 4 0 (0.6923077 0.3076923) *
## 1831) frontage< 41 70 21 1 (0.3000000 0.7000000) *
## 229) improve_issues_tallies< 17.5 89 29 1 (0.3258427 0.6741573) *
## 115) total_fines_by_parcel>=25 121 25 1 (0.2066116 0.7933884) *
## 29) council_di=1,2,7 240 54 1 (0.2250000 0.7750000) *
## 15) num_vacant_parcels>=112.5 361 70 1 (0.1939058 0.8060942)
## 30) frontage>=66.5 9 2 0 (0.7777778 0.2222222) *
## 31) frontage< 66.5 352 63 1 (0.1789773 0.8210227) *
Now try glm
# Logistic regression per fold, using the full formula.
models <- 1:10 %>% map(~ glm(complete_formula, data = train[-k_folds[[.x]],], family = "binomial"))
# type = "response" yields predicted probabilities in [0, 1].
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
# Threshold the probabilities at 0.5 and compare with the held-out labels.
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7072181
sd(as.numeric(accuracies))
## [1] 0.02084383
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 136 66
## TRUE 50 112
## truth
## pred 0 1
## FALSE 143 56
## TRUE 44 122
## truth
## pred 0 1
## FALSE 140 54
## TRUE 46 124
## truth
## pred 0 1
## FALSE 147 61
## TRUE 39 117
## truth
## pred 0 1
## FALSE 129 62
## TRUE 57 116
## truth
## pred 0 1
## FALSE 137 63
## TRUE 49 115
## truth
## pred 0 1
## FALSE 149 63
## TRUE 37 115
## truth
## pred 0 1
## FALSE 142 60
## TRUE 44 118
## truth
## pred 0 1
## FALSE 140 56
## TRUE 46 122
## truth
## pred 0 1
## FALSE 133 60
## TRUE 53 118
models[[9]]
##
## Call: glm(formula = complete_formula, family = "binomial", data = train[-k_folds[[.x]],
## ])
##
## Coefficients:
## (Intercept) total_fines_by_parcel
## -0.9433568 0.0002123
## number_of_violations_by_parcel improve_issues_tallies
## 0.1064047 0.0014371
## earlier_property_crime_incidents earlier_violent_crime_incidents
## -0.0071346 0.0057590
## total_acre council_di2
## 0.0645114 -0.1501909
## council_di3 council_di4
## -0.0871947 0.0486041
## council_di5 council_di6
## -0.6324548 -0.3312649
## council_di7 frontage
## -0.1536028 -0.0037837
## num_vacant_parcels num_nearby_blighted_parcels
## 0.0144593 0.1672508
## num_violations_nearby_parcels earlier_nuiscance_offences
## -0.0057991 0.0354739
##
## Degrees of Freedom: 3276 Total (i.e. Null); 3259 Residual
## Null Deviance: 4541
## Residual Deviance: 3712 AIC: 3748
Now try support vector machines, without normalizing the variables.
library(e1071)
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- complete_tally_set %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
# svm() does classification when the response is a factor.
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(451)
k_folds <- caret::createFolds(train$blighted)
# Default svm(): C-classification with a radial kernel.
models <- 1:10 %>% map(~ svm(complete_formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],]))
# svm predictions are already factors, so compare them with the labels directly.
accuracies <- 1:10 %>%
map(~ mean(unlist(predictions[[.x]] == train[k_folds[[.x]],]$blighted)))
#summary statistics over the models
mean(unlist(accuracies))
## [1] 0.7151791
sd(unlist(accuracies))
## [1] 0.02433306
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]]),
truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## 0 126 48
## 1 60 130
## truth
## pred 0 1
## 0 139 44
## 1 48 134
## truth
## pred 0 1
## 0 137 43
## 1 49 135
## truth
## pred 0 1
## 0 134 49
## 1 52 129
## truth
## pred 0 1
## 0 117 49
## 1 69 129
## truth
## pred 0 1
## 0 134 46
## 1 52 132
## truth
## pred 0 1
## 0 139 54
## 1 47 124
## truth
## pred 0 1
## 0 124 47
## 1 62 131
## truth
## pred 0 1
## 0 130 47
## 1 56 131
## truth
## pred 0 1
## 0 125 54
## 1 61 124
#Display one of the svm models.
#NOTE(review): the earlier wording here about "rpart models ... only one split"
#appears to be a copy-paste leftover from a different section; these are svm models.
models[[3]]
##
## Call:
## svm(formula = complete_formula, data = train[-k_folds[[.x]],
## ])
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.05555556
##
## Number of Support Vectors: 2105
Now try support vector machines, after normalizing the numerical variables.
# Response must be a factor for C-classification (already a factor from above;
# as.factor() is idempotent, so this is a harmless re-application).
train$blighted <- as.factor(train$blighted)
#cut out the non-numeric columns, scale the numeric columns, and then put the non-numeric columns back in
# NOTE: as_tibble() replaces the deprecated as.tibble() alias.
train_svm <- train %>% select(-parcelnum, -blighted, -council_di) %>% scale() %>% as_tibble() %>%
mutate(parcelnum = train$parcelnum, blighted = train$blighted, council_di = train$council_di)
set.seed(451)
k_folds <- caret::createFolds(train_svm$blighted)
# e1071::svm() scales its inputs internally by default (scale = TRUE), which is
# why the results below are identical to the unscaled run above.
models <- 1:10 %>% map(~ svm(complete_formula, data = train_svm[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train_svm[k_folds[[.x]],]))
# Per-fold accuracy: factor predictions compared directly with the labels.
accuracies <- 1:10 %>%
map(~ mean(unlist(predictions[[.x]] == train_svm[k_folds[[.x]],]$blighted)))
#summary statistics over the models
mean(unlist(accuracies))
## [1] 0.7151791
sd(unlist(accuracies))
## [1] 0.02433306
#confusion matrices for the models
# One table per fold: rows are predicted class, columns are the true class.
for (index in 1:10) {
print(table(pred = (predictions[[index]]),
truth = train_svm[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## 0 126 48
## 1 60 130
## truth
## pred 0 1
## 0 139 44
## 1 48 134
## truth
## pred 0 1
## 0 137 43
## 1 49 135
## truth
## pred 0 1
## 0 134 49
## 1 52 129
## truth
## pred 0 1
## 0 117 49
## 1 69 129
## truth
## pred 0 1
## 0 134 46
## 1 52 132
## truth
## pred 0 1
## 0 139 54
## 1 47 124
## truth
## pred 0 1
## 0 124 47
## 1 62 131
## truth
## pred 0 1
## 0 130 47
## 1 56 131
## truth
## pred 0 1
## 0 125 54
## 1 61 124
#Display one of the svm models trained on the scaled data.
#NOTE(review): the earlier wording here about "rpart models ... only one split"
#appears to be a copy-paste leftover from a different section; these are svm models.
models[[3]]
##
## Call:
## svm(formula = complete_formula, data = train_svm[-k_folds[[.x]],
## ])
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.05555556
##
## Number of Support Vectors: 2105
Now try random forest:
library(randomForest)
# Random forest per fold with 2000 trees (default mtry for classification).
# NOTE(review): k_folds was last built from train_svm (same rows/order as train),
# so the fold indices still line up with train — confirm this is intentional.
models <- 1:10 %>% map(~ randomForest(complete_formula,
data = train[-k_folds[[.x]],], ntree = 2000))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
# Convert factor predictions (levels "0"/"1") to numeric 0/1.
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
# Per-fold accuracy against the held-out fold's labels.
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7258919
sd(as.numeric(accuracies))
## [1] 0.0210122
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 126 43
## TRUE 60 135
## truth
## pred 0 1
## FALSE 143 46
## TRUE 44 132
## truth
## pred 0 1
## FALSE 134 42
## TRUE 52 136
## truth
## pred 0 1
## FALSE 126 37
## TRUE 60 141
## truth
## pred 0 1
## FALSE 118 45
## TRUE 68 133
## truth
## pred 0 1
## FALSE 134 47
## TRUE 52 131
## truth
## pred 0 1
## FALSE 142 45
## TRUE 44 133
## truth
## pred 0 1
## FALSE 131 47
## TRUE 55 131
## truth
## pred 0 1
## FALSE 127 49
## TRUE 59 129
## truth
## pred 0 1
## FALSE 128 45
## TRUE 58 133
models[[3]]
##
## Call:
## randomForest(formula = complete_formula, data = train[-k_folds[[.x]], ], ntree = 2000)
## Type of random forest: classification
## Number of trees: 2000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.13%
## Confusion matrix:
## 0 1 class.error
## 0 1186 489 0.2919403
## 1 400 1202 0.2496879
Now try adaboost. The first step is to define an adaboost function, in terms of rpart.
#returns a prediction on the "test" dataset
# AdaBoost implemented on top of rpart stumps/trees.
#
# Arguments:
#   ada_train   - training data frame; must contain columns `blighted` and `parcelnum`
#   ada_test    - data frame to predict; must contain `blighted` and `parcelnum`
#   formula     - model formula passed to rpart
#   iterations  - number of boosting rounds
#   tree_depth  - rpart maxdepth for each weak learner
#   min_cp      - rpart complexity parameter for each weak learner
#
# Returns a list with: per-iteration predictions on ada_test (cumulative ensemble),
# per-iteration accuracies, the fitted rpart models, the model (prediction) weights,
# the final example weights, the weight-adjustment factors, and the predicted parcels.
demolition_adaboost <- function(ada_train, ada_test, formula, iterations, tree_depth, min_cp) {
num_rows <- nrow(ada_train)
#initialize the weights to be applied to each example in the training set (uniform)
ada_train$wts <- 1/num_rows
models <- list() #initializing the list of rpart models
pred_weights <- list() #initializing the per-iteration model weights (alpha in AdaBoost terms)
adjustment_factors <- list() #initializing a list of the factors used to adjust the example weights
pred_list <- list() #predictions on the test dataset from EACH tree (NOT the sum)
for (index in 1:iterations) {
#In this implementation of Adaboost, the training set is sampled with replacement, and then the
#weights (as prescribed in the Adaboost meta-algorithm) are used to determine the relative import
#of the respective datapoints in an rpart implementation on the sample. I have also experimented with
#using the weights to determine the relative probabilities for selection in the sample. However, the
#performance (accuracy in cross-validation) was rather disappointing, with many of the weights
#quickly converging to zero.
ada_resample <- sample_n(ada_train, num_rows, replace = TRUE)
#the weights for the rpart model
wts <- ada_resample$wts
model <- rpart::rpart(formula, data = ada_resample,
method = "class", weights = wts,
control = rpart.control(maxdepth = tree_depth,
cp = min_cp))
#predict over the entire training set (used to compute the weighted error)
prediction <- predict(model, newdata = ada_train, type = "class")
ada_train$Prediction <- prediction
#The predictions on the proper test set are calculated here:
pred <- predict(model, newdata = ada_test, type="class")
ada_test$Prediction <- pred
####ada_test <- arrange(ada_test, parcelnum)
pred_list[[index]] <- as.numeric(ada_test$Prediction) - 1 #stores prediction for the test set
#We now handle the calculation of the adjustment factor and its application to the example weights
wrong_cases <- ada_train[ada_train$blighted != ada_train$Prediction,]
sum_weights_misclassified <- sum(wrong_cases$wts) #epsilon (weighted training error)
# beta = sqrt(eps / (1 - eps)): < 1 when eps < 0.5, so multiplying correct
# cases by beta shrinks their weights while dividing wrong cases grows theirs.
# NOTE(review): eps == 0 would make the log/ratio below degenerate — presumably
# never happens with depth-limited stumps on this data, but worth confirming.
adjustment_factor <-
sqrt(sum_weights_misclassified / (1 - sum_weights_misclassified)) #beta
correct_cases <- ada_train[ada_train$blighted == ada_train$Prediction,]
#apply the adjustment factors to the weights (membership is keyed on parcelnum)
ada_train <- transform(ada_train,
wts = ifelse(parcelnum %in% correct_cases$parcelnum,
wts * adjustment_factor, wts / adjustment_factor))
#renormalize the weights so they sum to one
ada_train <- transform(ada_train,
wts = wts/sum(wts))
#save the weight on the model(s) in this iteration, the model(s), and the adjustment factor
#in a list (for both calculating the adaboost prediction and for examining after running the
#program)
pred_weights[[index]] <- log((1-sum_weights_misclassified)/sum_weights_misclassified)
models[[index]] <- model
adjustment_factors[[index]] <- adjustment_factor
}
#Apply the weighted models to the test data, to derive our predictions
sum_weighted_predictions <- 0 #initialize the weighted sum of the predictions
#initialize a list in which the i'th element is the the adaboost prediction for i iterations
prediction_list <- list()
accuracy_list <- list()
#confusion_matrix_list <- list()
for (index in 1:iterations) {
pred <- pred_list[[index]]
# Map 0/1 votes to -0.5/+0.5 so each model's weighted vote is signed; the
# ensemble prediction is the sign of the running weighted sum.
sum_weighted_predictions <-
sum_weighted_predictions + (pred - 0.5)*pred_weights[[index]]
prediction_list[[index]] <- as.numeric(sum_weighted_predictions > 0)
# NOTE(review): comparing numeric 0/1 to ada_test$blighted assumes the labels
# are 0/1 (or a factor with levels "0"/"1") — confirm against callers.
accuracy_list[[index]] <- mean(as.numeric(prediction_list[[index]] == ada_test$blighted))
}
# Columns 1-2 identify the example; column index + 2 holds the cumulative
# ensemble prediction after `index` boosting rounds.
predictions_df <- tibble(parcelnum = ada_test$parcelnum, truth = ada_test$blighted)
for (index in 1:iterations) {
predictions_df[[index + 2]] <- prediction_list[[index]]
}
#return a comprehensive set of information
return(list(predictions = predictions_df,
accuracies = accuracy_list,
rpart_models = models,
prediction_weights = pred_weights,
example_weights = ada_train$wts,
adjustment_factors = adjustment_factors,
predicted_parcels = ada_test$parcelnum))
}
We now apply the function for training and k-fold cross-validation, using a parallel loop for the cross-validation. My various implementations of this approach on the training data have suggested that the adaboost model is most predictive at between 800 and 1200 boosting iterations. However, for any number of iterations, the accuracy of the models varies. I thus used the average of the predictions within the range from 800 to 1200, rounded to 0 or 1.
# Boosting hyperparameters: 3000 rounds of depth-1 stumps with cp = 0.006.
ada_iterations <- 3000
tree_depth <- 1
minimum_cp <- 0.006
library(doParallel)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(foreach)
# Run the 10 cross-validation folds in parallel on 6 worker processes.
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
model_info_list <-
foreach(index = 1:10,
.packages = c("tidyverse", "rpart")) %dopar% {
cross_train <- train[-k_folds[[index]],]
cross_test <- train[k_folds[[index]],]
#the information to be added to model_info_list, at this iteration of parallel loop:
prediction_and_info <-
demolition_adaboost(cross_train, cross_test, complete_formula, ada_iterations,
tree_depth, minimum_cp)
}
stopCluster(cl)
rm(cl)
# Stack the per-fold prediction data frames so every training parcel appears once.
combined_predictions_df <- model_info_list[[1]]$predictions
for (index in 2:10) {
combined_predictions_df <- rbind(combined_predictions_df,
model_info_list[[index]]$predictions)
}
# Average the cumulative ensemble predictions over a window of boosting rounds.
# NOTE(review): select(800:1200) picks columns by position, but columns 1-2 are
# parcelnum and truth, so these positions correspond to iterations 798-1198
# rather than 800-1200 — confirm whether the two-column offset was intended.
predictions <- combined_predictions_df %>% select(800:1200) %>%
rowMeans()
predictions <- round(predictions)
mean(as.numeric(predictions == combined_predictions_df$truth))
## [1] 0.7234276
# Free the large per-fold model list before continuing.
rm(model_info_list)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2565264 137.0 33889254 1809.9 NA 24068872 1285.5
## Vcells 108529827 828.1 337298010 2573.4 16384 396857188 3027.8
Consider the Pearson correlation coefficients (point biserials):
# Correlation of every (numeric-coerced) variable with the blighted label.
# council_di is a factor; as.numeric() turns it into its level codes, so its
# "correlation" here is only a rough ordinal signal.
correlations <- complete_tally_set %>%
select(-parcelnum) %>%
mutate(council_di = as.numeric(council_di)) %>%
cor %>%
as.data.frame() %>%
select(blighted)
correlations
## blighted
## later_recorded_crime_incidents -0.095603291
## blighted 1.000000000
## number_of_violations_by_parcel 0.115515081
## total_fines_by_parcel 0.099702015
## improve_issues_tallies -0.003417704
## earlier_property_crime_incidents -0.131612768
## earlier_violent_crime_incidents 0.020889176
## earlier_nuiscance_offences 0.034691110
## total_acre -0.031045148
## council_di 0.033230814
## frontage -0.075408866
## num_vacant_parcels 0.358722239
## num_nearby_blighted_parcels 0.369731749
## num_violations_nearby_parcels -0.038749707
## sum_fines_nearby_parcels -0.025778042
Cut out the two variables with the weakest correlations:
#the complete formula, again
# NOTE(review): this first assignment is immediately overwritten below and is
# effectively dead code; it is kept here only for comparison.
formula <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels + num_nearby_blighted_parcels + num_violations_nearby_parcels + sum_fines_nearby_parcels +
earlier_nuiscance_offences
#the formula with some weak variables removed
# NOTE(review): relative to the line above this drops sum_fines_nearby_parcels
# but adds improve_issues_tallies, leaving the same term set as complete_formula;
# confirm this matches the stated intent of cutting the weakest variables.
formula <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + improve_issues_tallies + earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels + num_nearby_blighted_parcels + num_violations_nearby_parcels + earlier_nuiscance_offences
# No explicit method: rpart infers "class" because blighted is a factor.
# cp = 0.005 here vs 0.003 in the earlier runs.
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],],
control = rpart.control(cp = 0.005)))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
# Convert factor predictions (levels "0"/"1") to numeric 0/1.
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7003447
sd(as.numeric(accuracies))
## [1] 0.02386516
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 136 68
## TRUE 50 110
## truth
## pred 0 1
## FALSE 141 47
## TRUE 46 131
## truth
## pred 0 1
## FALSE 128 43
## TRUE 58 135
## truth
## pred 0 1
## FALSE 142 61
## TRUE 44 117
## truth
## pred 0 1
## FALSE 122 56
## TRUE 64 122
## truth
## pred 0 1
## FALSE 135 57
## TRUE 51 121
## truth
## pred 0 1
## FALSE 129 49
## TRUE 57 129
## truth
## pred 0 1
## FALSE 130 54
## TRUE 56 124
## truth
## pred 0 1
## FALSE 128 62
## TRUE 58 116
## truth
## pred 0 1
## FALSE 131 55
## TRUE 55 123
models[[2]]
## n= 3276
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3276 1602 0 (0.5109890 0.4890110)
## 2) num_nearby_blighted_parcels< 1.5 1540 462 0 (0.7000000 0.3000000)
## 4) num_vacant_parcels< 50.5 1289 309 0 (0.7602793 0.2397207) *
## 5) num_vacant_parcels>=50.5 251 98 1 (0.3904382 0.6095618) *
## 3) num_nearby_blighted_parcels>=1.5 1736 596 1 (0.3433180 0.6566820)
## 6) num_nearby_blighted_parcels< 4.5 891 384 1 (0.4309764 0.5690236)
## 12) num_vacant_parcels< 92.5 700 334 1 (0.4771429 0.5228571)
## 24) number_of_violations_by_parcel< 0.5 457 208 0 (0.5448578 0.4551422)
## 48) earlier_property_crime_incidents>=167.5 234 89 0 (0.6196581 0.3803419) *
## 49) earlier_property_crime_incidents< 167.5 223 104 1 (0.4663677 0.5336323)
## 98) council_di=3,4,5,6,7 131 57 0 (0.5648855 0.4351145) *
## 99) council_di=1,2 92 30 1 (0.3260870 0.6739130) *
## 25) number_of_violations_by_parcel>=0.5 243 85 1 (0.3497942 0.6502058) *
## 13) num_vacant_parcels>=92.5 191 50 1 (0.2617801 0.7382199) *
## 7) num_nearby_blighted_parcels>=4.5 845 212 1 (0.2508876 0.7491124) *
# Logistic regression with the reduced formula. The accuracy statistics below
# match the earlier complete_formula glm run exactly, consistent with `formula`
# containing the same set of predictors (see the note where it is defined).
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))
# type = "response" yields predicted probabilities in [0, 1].
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
# Threshold at 0.5 and compare with the held-out labels.
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7072181
sd(as.numeric(accuracies))
## [1] 0.02084383
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 136 66
## TRUE 50 112
## truth
## pred 0 1
## FALSE 143 56
## TRUE 44 122
## truth
## pred 0 1
## FALSE 140 54
## TRUE 46 124
## truth
## pred 0 1
## FALSE 147 61
## TRUE 39 117
## truth
## pred 0 1
## FALSE 129 62
## TRUE 57 116
## truth
## pred 0 1
## FALSE 137 63
## TRUE 49 115
## truth
## pred 0 1
## FALSE 149 63
## TRUE 37 115
## truth
## pred 0 1
## FALSE 142 60
## TRUE 44 118
## truth
## pred 0 1
## FALSE 140 56
## TRUE 46 122
## truth
## pred 0 1
## FALSE 133 60
## TRUE 53 118
models[[2]]
##
## Call: glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]],
## ])
##
## Coefficients:
## (Intercept) number_of_violations_by_parcel
## -0.8234977 0.0922817
## total_fines_by_parcel improve_issues_tallies
## 0.0003437 0.0015887
## earlier_property_crime_incidents earlier_violent_crime_incidents
## -0.0067456 0.0052657
## total_acre council_di2
## 0.0964307 -0.3267911
## council_di3 council_di4
## -0.1590868 -0.0211589
## council_di5 council_di6
## -0.6832887 -0.3840206
## council_di7 frontage
## -0.2411291 -0.0050790
## num_vacant_parcels num_nearby_blighted_parcels
## 0.0151475 0.1535612
## num_violations_nearby_parcels earlier_nuiscance_offences
## -0.0057401 0.0344131
##
## Degrees of Freedom: 3275 Total (i.e. Null); 3258 Residual
## Null Deviance: 4540
## Residual Deviance: 3711 AIC: 3747
Partition the dataset according to whether num_nearby_blighted_parcels
< 1.5 and consider the Pearson correlation coefficient (point biserials) within each of the elements of the partition:
# Subset with few nearby blighted parcels (the tree's left branch).
low_num_blighted_parcels <- train %>% filter(num_nearby_blighted_parcels < 1.5)
# blighted is a factor in train, so it must be coerced back to numeric
# (level codes 1/2) before cor(); this preserves the correlation's sign.
correlations <- low_num_blighted_parcels %>%
select(-parcelnum) %>%
mutate(council_di = as.numeric(council_di),
blighted = as.numeric(blighted)) %>%
cor %>%
as.data.frame() %>%
select(blighted)
correlations
## blighted
## later_recorded_crime_incidents -0.036864208
## blighted 1.000000000
## number_of_violations_by_parcel 0.171522621
## total_fines_by_parcel 0.167231089
## improve_issues_tallies 0.037581693
## earlier_property_crime_incidents -0.064252048
## earlier_violent_crime_incidents 0.045662418
## earlier_nuiscance_offences 0.059415806
## total_acre -0.002345569
## council_di 0.119472871
## frontage -0.035256168
## num_vacant_parcels 0.308840974
## num_nearby_blighted_parcels 0.195161605
## num_violations_nearby_parcels -0.018366512
## sum_fines_nearby_parcels -0.011035624
# Subset with more nearby blighted parcels (the tree's right branch).
higher_num_blighted_parcels <- train %>% filter(num_nearby_blighted_parcels >= 1.5)
# Same coercions as for the low-blight subset above.
correlations <- higher_num_blighted_parcels %>%
select(-parcelnum) %>%
mutate(council_di = as.numeric(council_di),
blighted = as.numeric(blighted)) %>%
cor %>%
as.data.frame() %>%
select(blighted)
correlations
## blighted
## later_recorded_crime_incidents -0.084367085
## blighted 1.000000000
## number_of_violations_by_parcel 0.098843507
## total_fines_by_parcel 0.065530148
## improve_issues_tallies -0.041507581
## earlier_property_crime_incidents -0.109006752
## earlier_violent_crime_incidents -0.069340524
## earlier_nuiscance_offences -0.002753084
## total_acre -0.009036265
## council_di -0.083504347
## frontage -0.050926429
## num_vacant_parcels 0.183269241
## num_nearby_blighted_parcels 0.199218462
## num_violations_nearby_parcels -0.070594363
## sum_fines_nearby_parcels -0.063022911
We try partitioning the dataset according to one of the more predictive variables, then running random forest on each of the two subsets.
formula <- blighted ~ number_of_violations_by_parcel + total_fines_by_parcel + earlier_property_crime_incidents + earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels + num_nearby_blighted_parcels + num_violations_nearby_parcels + sum_fines_nearby_parcels +
earlier_nuiscance_offences
# Fit one random forest per cross-validation fold on the "low" stratum
# (fewer than ~2 nearby blighted parcels).
models_1 <- map(1:10, function(fold) {
  randomForest(formula,
               data = train[-k_folds[[fold]],] %>%
                 filter(num_nearby_blighted_parcels < 1.5))
})
# ...and one per fold on the "high" stratum.
models_2 <- map(1:10, function(fold) {
  randomForest(formula,
               data = train[-k_folds[[fold]],] %>%
                 filter(num_nearby_blighted_parcels >= 1.5))
})
# Predict on each held-out fold, restricted to the matching stratum, then
# recode the factor predictions (levels "0"/"1") to numeric 0/1.
predictions_1 <- map(1:10, function(fold) {
  held_out <- train[k_folds[[fold]],] %>%
    filter(num_nearby_blighted_parcels < 1.5)
  predict(models_1[[fold]], newdata = held_out, type = "class")
})
predictions_1 <- map(predictions_1, ~ as.numeric(.x) - 1)
predictions_2 <- map(1:10, function(fold) {
  held_out <- train[k_folds[[fold]],] %>%
    filter(num_nearby_blighted_parcels >= 1.5)
  predict(models_2[[fold]], newdata = held_out, type = "class")
})
predictions_2 <- map(predictions_2, ~ as.numeric(.x) - 1)
# Per-example correctness indicators (1 = correct) for each fold, within each
# stratum.  predictions_* are already numeric 0/1; comparing them against the
# factor `blighted` (levels "0"/"1") coerces both sides to character, so a
# direct equality test is sufficient.
accuracies_1 <- map(1:10, function(fold) {
  truth <- (train[k_folds[[fold]],] %>%
              filter(num_nearby_blighted_parcels < 1.5))$blighted
  as.numeric(predictions_1[[fold]] == truth)
})
# NOTE(review): the original wrapped predictions_2 in as.numeric(!x < 0.5),
# which is a no-op for 0/1 values and inconsistent with accuracies_1;
# removed for consistency.  Behavior is unchanged.
accuracies_2 <- map(1:10, function(fold) {
  truth <- (train[k_folds[[fold]],] %>%
              filter(num_nearby_blighted_parcels >= 1.5))$blighted
  as.numeric(predictions_2[[fold]] == truth)
})
#summary statistics over the models
# Mean per-example accuracy in the low-density stratum, pooled over folds.
mean(unlist(accuracies_1))
## [1] 0.7443609
# Mean per-example accuracy in the high-density stratum.
mean(unlist(accuracies_2))
## [1] 0.6940377
#grand mean
# Overall accuracy, weighting every held-out example equally.
mean(c(unlist(accuracies_1), unlist(accuracies_2)))
## [1] 0.7179346
Consider the following decision tree, which forms the rpart model when the complexity parameter is set relatively high.
We partition our data according to the end-nodes in our decision tree, and then train a random forest model on each of the three subsets specified by the end nodes.
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, %||%, as_function, flatten, flatten_chr, flatten_dbl,
## flatten_int, flatten_lgl, invoke, list_along, modify, prepend,
## rep_along, splice
# Splitting conditions taken from the end nodes of the high-cp rpart tree
# shown above, captured as quoted expressions so they can be spliced into
# filter() calls later with !! (rlang tidy evaluation).
condition_1 <- expr(num_nearby_blighted_parcels < 1.5 & num_vacant_parcels < 50.5)
condition_2 <- expr(num_nearby_blighted_parcels < 1.5 & num_vacant_parcels >= 50.5)
condition_3 <- expr(num_nearby_blighted_parcels >= 1.5)
cat("Case 1\n")
## Case 1
# Echo the first end-node condition, then compute per-variable correlations
# with blight inside the subset it selects.
condition_1
## num_nearby_blighted_parcels < 1.5 & num_vacant_parcels < 50.5
correlations <- train %>% filter(!!condition_1) %>%
select(-parcelnum) %>%
mutate(council_di = as.numeric(council_di),
blighted = as.numeric(blighted)) %>%
cor %>%
as.data.frame() %>%
select(blighted)
correlations
## blighted
## later_recorded_crime_incidents 0.08601412
## blighted 1.00000000
## number_of_violations_by_parcel 0.18193737
## total_fines_by_parcel 0.19494348
## improve_issues_tallies 0.10234646
## earlier_property_crime_incidents 0.05643093
## earlier_violent_crime_incidents 0.14697412
## earlier_nuiscance_offences 0.07716656
## total_acre -0.01457370
## council_di 0.05569572
## frontage -0.01025897
## num_vacant_parcels 0.11002305
## num_nearby_blighted_parcels 0.15887367
## num_violations_nearby_parcels 0.04751772
## sum_fines_nearby_parcels 0.04005951
cat("\nCase 2\n")
##
## Case 2
# Second end-node subset: few nearby blighted parcels, but many vacant ones.
condition_2
## num_nearby_blighted_parcels < 1.5 & num_vacant_parcels >= 50.5
correlations <- train %>% filter(!!condition_2) %>%
select(-parcelnum) %>%
mutate(council_di = as.numeric(council_di),
blighted = as.numeric(blighted)) %>%
cor %>%
as.data.frame() %>%
select(blighted)
correlations
## blighted
## later_recorded_crime_incidents -0.13442428
## blighted 1.00000000
## number_of_violations_by_parcel 0.09730020
## total_fines_by_parcel 0.09436696
## improve_issues_tallies -0.10583204
## earlier_property_crime_incidents -0.15685899
## earlier_violent_crime_incidents -0.13620956
## earlier_nuiscance_offences -0.03463447
## total_acre -0.03736132
## council_di -0.04086219
## frontage -0.09196425
## num_vacant_parcels 0.17236660
## num_nearby_blighted_parcels 0.06368801
## num_violations_nearby_parcels -0.14542858
## sum_fines_nearby_parcels -0.10425741
cat("\nCase 3\n")
##
## Case 3
# Third end-node subset; this is the same filter used for
# higher_num_blighted_parcels above, so the correlations repeat.
condition_3
## num_nearby_blighted_parcels >= 1.5
correlations <- train %>% filter(!!condition_3) %>%
select(-parcelnum) %>%
mutate(council_di = as.numeric(council_di),
blighted = as.numeric(blighted)) %>%
cor %>%
as.data.frame() %>%
select(blighted)
correlations
## blighted
## later_recorded_crime_incidents -0.084367085
## blighted 1.000000000
## number_of_violations_by_parcel 0.098843507
## total_fines_by_parcel 0.065530148
## improve_issues_tallies -0.041507581
## earlier_property_crime_incidents -0.109006752
## earlier_violent_crime_incidents -0.069340524
## earlier_nuiscance_offences -0.002753084
## total_acre -0.009036265
## council_di -0.083504347
## frontage -0.050926429
## num_vacant_parcels 0.183269241
## num_nearby_blighted_parcels 0.199218462
## num_violations_nearby_parcels -0.070594363
## sum_fines_nearby_parcels -0.063022911
# Per-subset formulas: each omits predictors that showed little or no
# correlation with blight inside the corresponding end-node subset.
formula_1 <- blighted ~
  number_of_violations_by_parcel + total_fines_by_parcel +
  improve_issues_tallies + earlier_property_crime_incidents +
  earlier_violent_crime_incidents + council_di +
  num_violations_nearby_parcels + earlier_nuiscance_offences
formula_2 <- blighted ~
  total_fines_by_parcel + number_of_violations_by_parcel +
  improve_issues_tallies + earlier_property_crime_incidents +
  earlier_violent_crime_incidents + council_di + frontage +
  num_vacant_parcels + num_nearby_blighted_parcels +
  num_violations_nearby_parcels
formula_3 <- blighted ~
  number_of_violations_by_parcel + total_fines_by_parcel +
  improve_issues_tallies + earlier_property_crime_incidents +
  earlier_violent_crime_incidents + total_acre + council_di + frontage +
  num_vacant_parcels + num_violations_nearby_parcels +
  sum_fines_nearby_parcels
# Uncomment to fall back to the single shared formula for all three subsets:
#formula_1 <- formula
#formula_2 <- formula
#formula_3 <- formula
# One random forest per fold for each of the three tree-defined subsets;
# ntree is raised to 2000 to stabilize fits on the smaller per-subset samples.
models_1 <- map(1:10, ~ randomForest(formula_1,
                                     data = train[-k_folds[[.x]],] %>% filter(!!condition_1),
                                     ntree = 2000))
models_2 <- map(1:10, ~ randomForest(formula_2,
                                     data = train[-k_folds[[.x]],] %>% filter(!!condition_2),
                                     ntree = 2000))
models_3 <- map(1:10, ~ randomForest(formula_3,
                                     data = train[-k_folds[[.x]],] %>% filter(!!condition_3),
                                     ntree = 2000))
# Class predictions on each held-out fold, restricted to the matching subset.
predictions_1 <- map(1:10, ~ predict(models_1[[.x]],
                                     newdata = train[k_folds[[.x]],] %>% filter(!!condition_1),
                                     type = "class"))
predictions_2 <- map(1:10, ~ predict(models_2[[.x]],
                                     newdata = train[k_folds[[.x]],] %>% filter(!!condition_2),
                                     type = "class"))
predictions_3 <- map(1:10, ~ predict(models_3[[.x]],
                                     newdata = train[k_folds[[.x]],] %>% filter(!!condition_3),
                                     type = "class"))
# Ground-truth labels aligned row-for-row with each prediction vector.
truth_1 <- map(1:10, ~ (train[k_folds[[.x]],] %>% filter(!!condition_1))$blighted)
truth_2 <- map(1:10, ~ (train[k_folds[[.x]],] %>% filter(!!condition_2))$blighted)
truth_3 <- map(1:10, ~ (train[k_folds[[.x]],] %>% filter(!!condition_3))$blighted)
# Per-example correctness indicators (1 = correct) for every fold.
accuracies_1 <- map2(predictions_1, truth_1, ~ as.numeric(.x == .y))
accuracies_2 <- map2(predictions_2, truth_2, ~ as.numeric(.x == .y))
accuracies_3 <- map2(predictions_3, truth_3, ~ as.numeric(.x == .y))
# Mean per-example accuracy within each subset, pooled across folds.
mean(unlist(accuracies_1))
## [1] 0.76434
mean(unlist(accuracies_2))
## [1] 0.6843972
mean(unlist(accuracies_3))
## [1] 0.6830544
#grand mean
# Overall accuracy over all three subsets, weighting each example equally.
mean(c(unlist(accuracies_1), unlist(accuracies_2), unlist(accuracies_3)))
## [1] 0.7154628
cat("\n")
#grand confusion matrix
# Pool truth and predictions across all folds and all three subsets.  The
# factor vectors unlist to their integer codes (1/2), so subtracting 1
# recovers the 0/1 labels.
all_truth <- c(unlist(truth_1), unlist(truth_2), unlist(truth_3)) - 1
all_preds <- c(unlist(predictions_1), unlist(predictions_2), unlist(predictions_3)) - 1
table(truth = all_truth, pred = all_preds)
## pred
## truth 0 1
## 0 1285 576
## 1 460 1320
Having trained a model on the complete training set and then tested it on the 15% of the data that was withheld for final testing, I achieved an accuracy of 75%. As I was somewhat skeptical as to whether this figure would be typical, I tried k-fold cross testing over the entire balanced dataset.
#ensure that the outcome variable is of type class (a factor), so the
#models treat blight prediction as classification
complete_tally_set$blighted <- as.factor(complete_tally_set$blighted)
#set.seed(451)
# NOTE(review): with the seed commented out, the fold assignment below (and
# therefore the reported accuracies) is not reproducible between runs.
full_k_folds <- caret::createFolds(complete_tally_set$blighted)
#(with the default of 10 folds)
# Adaboost settings: number of boosting iterations, depth of each rpart
# weak learner (depth 1 = stumps), and its minimum complexity parameter.
ada_iterations <- 2000
tree_depth <- 1
minimum_cp <- 0.006
library(doParallel)
library(foreach)
# Run the 10 cross-validation fits in parallel on 6 worker processes.
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
# Each iteration trains adaboost via demolition_adaboost (defined in the
# earlier draft) on nine folds and predicts on the held-out fold; foreach
# collects each iteration's return value into model_info_list.
model_info_list <-
  foreach(index = 1:10,
          .packages = c("tidyverse", "rpart")) %dopar% {
    cross_train <- complete_tally_set[-full_k_folds[[index]],]
    cross_test <- complete_tally_set[full_k_folds[[index]],]
    # The loop body's value is its last expression; the original bound it to
    # an unused local (prediction_and_info) first, which added nothing.
    demolition_adaboost(cross_train, cross_test, complete_formula,
                        ada_iterations, tree_depth, minimum_cp)
  }
stopCluster(cl)
rm(cl)
#max(accuracy_averages)
#which.max(accuracy_averages)
#names(accuracy_averages) <- 1:length(accuracy_averages)
#3000
#accuracy_averages[1:1000]
#389-390
#486-495
#772-793
#max(accuracy_averages)
#.72613
#rm(model_info_list)
#gc()
# Stack the ten per-fold prediction data frames (one row per held-out
# example) into a single data frame.  bind_rows over the plucked list
# replaces the original rbind-in-a-loop, which copies the accumulated
# frame on every iteration (quadratic cost).
combined_predictions_df <- model_info_list %>%
  map("predictions") %>%
  bind_rows()
#for (index in 3:ncol(combined_predictions_df)) {
# print(c(index - 2, mean(combined_predictions_df$truth == combined_predictions_df[[index]])))
#}
# Ensemble vote: average the 0/1 predictions stored in columns 900-1100
# (positional indices -- presumably late, converged boosting iterations;
# TODO confirm against the column layout demolition_adaboost produces),
# then round to the majority class and score against the truth column.
predictions <- combined_predictions_df %>% select(900:1100) %>%
  rowMeans()
predictions <- round(predictions)
mean(as.numeric(predictions == combined_predictions_df$truth))
## [1] 0.7226243
Remove the information returned by the parallel loop, which includes each of the rpart models that were generated as part of adaboost.
# Drop the large adaboost results (including every component rpart model)
# and force a garbage collection to reclaim memory before the next run.
rm(model_info_list)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 2578914 137.8 27111403 1448 NA 24068872 1285.5
## Vcells 113567746 866.5 323870089 2471 16384 396857188 3027.8
Now look at random forest over the complete balanced set.
# Ensure the outcome is a factor so randomForest performs classification.
complete_tally_set$blighted <- as.factor(complete_tally_set$blighted)
#set.seed(555)
full_k_folds <- caret::createFolds(complete_tally_set$blighted)
#(with the default of 10 folds)
library(randomForest)
library(doParallel)
library(foreach)
# Fit the ten cross-validation forests in parallel on 6 workers, using the
# shared `formula` defined earlier.
cl <- makePSOCKcluster(6)
registerDoParallel(cl)
randomForest_models <-
  foreach(index = 1:10,
          .packages = c("tidyverse", "randomForest")) %dopar% {
    randomForest(formula,
                 complete_tally_set[-full_k_folds[[index]],],
                 ntree = 1000)
  }
stopCluster(cl)
rm(cl)
# Class predictions for each held-out fold.
predictions <- map(1:10, ~ predict(randomForest_models[[.x]],
                                   newdata = complete_tally_set[full_k_folds[[.x]],],
                                   type = "class"))
#predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
# Mean accuracy within each fold.
accuracies <- map(1:10, ~ mean(predictions[[.x]] ==
                                 complete_tally_set[full_k_folds[[.x]],]$blighted))
#mean accuracy for each fold
# (prints a list of 10 per-fold mean accuracies)
accuracies
## [[1]]
## [1] 0.7242991
##
## [[2]]
## [1] 0.7266355
##
## [[3]]
## [1] 0.7296037
##
## [[4]]
## [1] 0.7056075
##
## [[5]]
## [1] 0.7149533
##
## [[6]]
## [1] 0.7272727
##
## [[7]]
## [1] 0.7079439
##
## [[8]]
## [1] 0.7365967
##
## [[9]]
## [1] 0.7266355
##
## [[10]]
## [1] 0.7102804
#mean accuracy and standard deviation over the folds
# Note: this is an unweighted mean of the 10 per-fold accuracies.
mean(unlist(accuracies))
## [1] 0.7209828
sd(unlist(accuracies))
## [1] 0.01048752
#confusion matrices for the models
# One confusion matrix per fold.  seq_along(predictions) replaces the
# hard-coded 1:10 so the loop tracks the actual number of folds (and is
# safe if the list were ever empty).
for (index in seq_along(predictions)) {
  print(table(pred = predictions[[index]],
              truth = complete_tally_set[full_k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## 0 163 61
## 1 57 147
## truth
## pred 0 1
## 0 155 52
## 1 65 156
## truth
## pred 0 1
## 0 158 54
## 1 62 155
## truth
## pred 0 1
## 0 144 50
## 1 76 158
## truth
## pred 0 1
## 0 155 57
## 1 65 151
## truth
## pred 0 1
## 0 154 51
## 1 66 158
## truth
## pred 0 1
## 0 145 50
## 1 75 158
## truth
## pred 0 1
## 0 158 51
## 1 62 158
## truth
## pred 0 1
## 0 168 65
## 1 52 143
## truth
## pred 0 1
## 0 158 62
## 1 62 146
randomForest_models[[4]]
##
## Call:
## randomForest(formula = formula, data = complete_tally_set[-full_k_folds[[index]], ], ntree = 1000)
## Type of random forest: classification
## Number of trees: 1000
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.29%
## Confusion matrix:
## 0 1 class.error
## 0 1400 580 0.2929293
## 1 472 1403 0.2517333