In this project, I will attempt to predict whether, for any building within a set of identified buildings in Detroit, that building will be targeted for demolition. Potential predictors include citations related to the building, crime, and complaints concerning the building related to blight. I have more or less completed the data cleaning, and I am in the process of associating the various predictors with the buildings. So far I am using about 3GB of data downloaded from https://data.detroitmi.gov/.

library(tidyverse)
library(sf)
library(ggmap)

#recorded violations associated with blight (e.g. unkempt properties)
blight_violations <- read_csv("./data/Blight_Violations_3_19_2018.csv", 
                              guess_max = 10^6)

#read the downloaded file for all the building permits and then filter out the permits for dismantling
dismantle_permits <- read_csv("./data/Building_Permits_3_19_2018.csv", 
                             guess_max = 10^6) %>%
  filter(`Building Permit Type` == "Dismantle")
  
#the files that contain the crime data
crime_to_12062016 <- 
  read_csv("./data/DPD__All_Crime_Incidents__January_1__2009_-_December_6__2016.csv",
           guess_max = 10^6)

crime_12062016_to_03192018 <- 
  read_csv("./data/DPD__All_Crime_Incidents__December_6__2016_-_3_19_2018.csv", 
           guess_max = 10^6)

#the 311 system
improve_detroit_issues <- read_csv("./data/Improve_Detroit_Issues_3_19_2018.csv", 
                                   guess_max = 10^6)

#another file with demolition information downloaded 4/4/2017. I should note that this file appears to cover a somewhat different domain of cases than does dismantle_permits. The difference appears to be that completed_demolitions covers just those buildings that have been dismantled under the Detroit Demolition Program. It may thus be the case that this data omits cases of buildings demolished not because of blight but to make room for something else. 
completed_demolitions <- read_csv("./data/Detroit_Demolitions.csv",
                                  guess_max = 10^6)

upcoming_demolitions <- read_csv("./data/Upcoming_Demolitions.csv")

#the shapefile representing Detroit parcels, read as an sf data frame
parcel_sf <- st_read("./data/Parcel Map")

#convert the parcel number column to a character vector, to match `Parcel Number` in the dismantle permits data
parcel_sf <- parcel_sf %>% mutate(parcelnum = as.character(parcelnum))

#downloaded from http://d3-d3.opendata.arcgis.com/datasets/383eb730952e470389f09617b5448026_0 on 04/13/2018: "Hardest Hit Fund Areas, with Expansion"
Hardest_Hit_Fund_Areas <- st_read("./data/Hardest_Hit_Fund_Areas_with_Expansion.kml", 
                                  crs = st_crs(parcel_sf))

city_council_districts <- st_read("./data/City Council Districts")

For all of the downloaded data sets other than the parcels dataset, we extract the usable latitude and longitude values and then use this information to form simple features (sf) objects. Rows with obviously incorrect values, or values that would represent positions well outside Detroit, are filtered out, together with rows for which the latitude or longitude data is missing.

#function for converting the position (character) column into a column of points in the simple features (sf) framework.
add_sf_point <- function(df, column) {
  
  #extract the latitude and longitude from the string column that contains both. Because the parentheses
  #are located from the end of the strings, the same function can be used for all of the
  #datasets for which we need to extract this information.
  latitude <- str_sub(df[[column]], 
                      stringi::stri_locate_last_fixed(df[[column]], "(")[,2] + 1,
                      stringi::stri_locate_last_fixed(df[[column]], ",")[,1] - 1)
  longitude <- str_sub(df[[column]], 
                       stringi::stri_locate_last_fixed(df[[column]], ", ")[,2] + 1, 
                       stringi::stri_locate_last_fixed(df[[column]], ")")[,1] - 1)
  
  #add the latitude and longitude to a copy of the dataframe, filter out the NAs from
  #these results, and then convert to sf, with point positions indicated in the
  #geometry column
  mutated <- df %>% mutate(extracted_lat = as.double(latitude),
                      extracted_lon = as.double(longitude))
  
  #remove rows with NAs for latitude or longitude, or with values well outside of Detroit
  filtered <- mutated %>%
    filter(!is.na(extracted_lat) & !is.na(extracted_lon)) %>%
    filter(41 < extracted_lat & extracted_lat < 44 & -85 < extracted_lon & extracted_lon < -81)
    
  #create a dataframe from the items that have been filtered out
  result_coord_na <- setdiff(mutated, filtered)
  
  #create sf objects from the rows with usable latitude and longitude information
  result_sf <- st_as_sf(filtered, coords = c("extracted_lon", "extracted_lat"), 
                        crs = st_crs(parcel_sf))
    
  return(list(result_sf, result_coord_na))
}

#apply the function to the seven datasets that were not loaded as simple features dataframes. For each dataset this produces a list of two dataframes: the first element is a simple features data frame, and the second element holds the rows that could not be converted to simple features
blight_violations_split <- add_sf_point(blight_violations, "Violation Location")

dismantle_permits_split <- add_sf_point(dismantle_permits, "Permit Location")

crime_to_12062016_split <- add_sf_point(crime_to_12062016, "LOCATION")

crime_12062016_to_03192018_split <- add_sf_point(crime_12062016_to_03192018, "Location")

improve_detroit_issues_split <- add_sf_point(improve_detroit_issues, "Location")

completed_demolitions_split <- add_sf_point(completed_demolitions, "Location")

upcoming_demolitions_split <- add_sf_point(upcoming_demolitions, "Location")

We now consider the data for which we do not yet have position data, and complete the information as well as we reasonably can, using the Google API and a function, geocode_pause, that handles some of the API’s quirks.

#the portion of the downloaded blight citations data for which we do not yet have usable position data
blight_vio_na <- blight_violations_split[[2]]

#remove the rows for which geocoding is not likely to produce reliable results
useful <- blight_vio_na %>% filter(!is.na(`Violation Street Name`), 
                                   `Violation Street Number` > 0, 
                                   !is.na(`Violation Zip Code`))

#create a column of addresses to be used in geocoding
useful <- useful %>% 
  mutate(complete_address = paste(`Violation Street Number`, " ", `Violation Street Name`, ", ",
                                  "Detroit, Michigan", " ", `Violation Zip Code`, sep = ""))

#function that makes a maximum of 6 attempts to geocode the given address using the Google API, with a pause of 1 second between attempts. We will use the function for the other datasets as well.
geocode_pause <- function(address) {
  for (index in 1:6) {
    Sys.sleep(1)
    location <- ggmap::geocode(address)
    if (!is.na(location$lon)) {
      return(location)
    }
  }
  #if all six attempts fail, return NULL (these rows are removed later)
  return(NULL)
}
#apply geocode_pause to each element of the complete_address column and place the result in a new column, in which each entry is a data frame
useful <- useful %>% mutate(location = map(complete_address, geocode_pause))

#save to disk, to avoid the need to geocode these addresses again when we rerun the analysis
write_rds(useful, "./data/blight_violations_geocodes.rds")

The geocoding has returned a data frame for each of the addresses. We thus need to unpack the elements of the location column, each of which is a data frame.
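
As a minimal illustration of the unnest step (with made-up values rather than project data), a list-column whose elements are one-row data frames unpacks into ordinary lon and lat columns:

#toy example: a list-column in which each element is a one-row data frame of coordinates
toy <- tibble(id = 1:2,
              location = list(tibble(lon = -83.10, lat = 42.35),
                              tibble(lon = -83.05, lat = 42.40)))
#unnest() replaces the list-column with ordinary lon and lat columns
toy %>% unnest(location)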

#read blight_violations_geocodes as a tibble
blight_violations_geocodes <- read_rds("./data/blight_violations_geocodes.rds")

#function for removing the instances for which geocoding failed (for which the value in the location column is NULL). We will use this function for all of the geocoded data frames.
remove_null_locations <- function(df) {
  #identify the rows for which the value in the location column is NULL
  null_rows <- list()
  for (index in 1:nrow(df)) {
    if (is.null(df$location[[index]])) {
      null_rows <- c(null_rows, index)
    }
  }
  #remove the rows for which the value of the location column is NULL; if there are none, return the data frame unchanged
  if (length(null_rows) == 0) return(df)
  df <- df[-as.integer(null_rows),]
}

blight_violations_geocodes <- remove_null_locations(blight_violations_geocodes)

#With blight_violations_geocodes a tibble, we can apply tidyr::unnest(), which will place the latitude and longitude in columns labelled "lat" and "lon".
blight_violations_geocodes <- blight_violations_geocodes %>% unnest(location)

#fill in the `Violation Latitude` and `Violation Longitude` columns, which already exist in the blight_violations data frame
blight_violations_geocodes <- blight_violations_geocodes %>%
  mutate(`Violation Latitude` = lat,
         `Violation Longitude` = lon)

#cut out some columns that have been added
blight_violations_geocodes <- blight_violations_geocodes %>% 
  select(-extracted_lat, -extracted_lon, -complete_address)

#put the position information into a simple features format (which will remove the "lat" and "lon" columns)
blight_violations_geocodes_sf <- st_as_sf(blight_violations_geocodes, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))

#combine the results with the previously generated sf data
blight_violations_sf <- rbind(blight_violations_split[[1]], blight_violations_geocodes_sf)

rm(blight_vio_na, blight_violations, blight_violations_geocodes, 
   blight_violations_geocodes_sf, blight_violations_split, useful)
#the dismantle permits for which position data (latitude and longitude) is missing
dismantle_permits_split_na <- dismantle_permits_split[[2]]

#remove the last two columns, which were not contained in the original dismantle_permits dataset
dismantle_permits_split_na <- dismantle_permits_split_na %>% 
  select(-extracted_lat, -extracted_lon)
#geocode the items in dismantle_permits_split_na, using the address column and the function geocode_pause, which makes a maximum of six attempts for each item. The result is a list of data frames in the location column.
dismantle_permits_split_geocode <- dismantle_permits_split_na %>%
  mutate(location = map(str_c(`Site Address`, ", Detroit, Michigan"), geocode_pause))

#write the results of the geocoding to disk, to avoid having to repeat the geocoding when rerunning the analysis.
write_rds(dismantle_permits_split_geocode, "./data/dismantle_permits_geocodes.rds")

rm(dismantle_permits_split_na)
#load the geocoded data frame into R
dismantle_permits_split_geocode <- read_rds("./data/dismantle_permits_geocodes.rds")

#use remove_null_locations() to remove the rows for which geocoding failed and then parse the information in the dataframes in the location column into two new columns, lat and lon
dismantle_permits_split_geocode <- 
  remove_null_locations(dismantle_permits_split_geocode) %>%
  unnest(location)

#convert to a simple features (sf) data frame, using the latitudes and longitudes
dismantle_permits_geocode_sf <- st_as_sf(dismantle_permits_split_geocode, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))

#append this simple features dataframe to the dataframe for which we already had usable positions
dismantle_permits_sf <- rbind(dismantle_permits_split[[1]], dismantle_permits_geocode_sf)

rm(dismantle_permits_split_geocode, dismantle_permits_geocode_sf, dismantle_permits, dismantle_permits_split)

We now fill in the missing position information for the dataset covering crimes up to 12-06-2016.

#return to the older crime data
crime_to_12062016_leftovers <- crime_to_12062016_split[[2]]

#cut out the addresses that begin with "00"
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  filter(str_sub(LOCATION, 1, 2) != "00")

#filter out some obviously useless addresses, with few characters before the first "("
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  filter(!str_locate(LOCATION, "\\(")[,1] %in% 1:13)

#remove the two columns that were added earlier
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  select(-extracted_lat, -extracted_lon)

#create a column for use in geocoding
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>% 
  mutate(extracted_address = str_c(str_sub(LOCATION, 1, 
                                           str_locate(LOCATION, "\\(")[,1] - 2),
                                                      ", Detroit, Michigan"))
#geocode the elements of extracted_address, using the function geocode_pause
crime_to_12062016_leftovers_geocode <- crime_to_12062016_leftovers %>% 
  mutate(location = map(extracted_address, geocode_pause))

#save the results, to avoid having to geocode again when rerunning the analysis
write_rds(crime_to_12062016_leftovers_geocode, "./data/crime_to_12062016_leftovers_geocode.rds")
#read the saved results
crime_to_12062016_leftovers_geocode <- read_rds("./data/crime_to_12062016_leftovers_geocode.rds")

#cut out the column we used for geocoding
crime_to_12062016_leftovers_geocode <- 
  crime_to_12062016_leftovers_geocode %>% select(-extracted_address)
  
#cut out the geocode failures and put the location information into the columns lat and lon
crime_to_12062016_leftovers_geocode <- 
  remove_null_locations(crime_to_12062016_leftovers_geocode) %>%
  unnest(location)

#create a simple features (sf) object, using the latitudes and longitudes
crime_to_12062016_leftovers_sf <- st_as_sf(crime_to_12062016_leftovers_geocode, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))

#append this simple features dataframe to the dataframe for which we already had locations
crime_to_12062016_sf <- rbind(crime_to_12062016_split[[1]], crime_to_12062016_leftovers_sf)

rm(crime_to_12062016, crime_to_12062016_leftovers_geocode, crime_to_12062016_leftovers_sf, crime_to_12062016_leftovers, crime_to_12062016_split)
#consider the examples in the recent crime data for which the conversion to sf didn't work, remove the two columns that we added, and create an address column for geocoding
crime_12062016_to_03192018_leftovers <- crime_12062016_to_03192018_split[[2]] %>%
  select(-extracted_lat, -extracted_lon) %>%
  mutate(extracted_address = str_c(`Incident Address`, ", Detroit, Michigan"))
crime_12062016_to_03192018_geocode <- crime_12062016_to_03192018_leftovers %>% 
  mutate(location = map(extracted_address, geocode_pause))

write_rds(crime_12062016_to_03192018_geocode, "./data/crime_12062016_to_03192018_geocode.rds")
crime_12062016_to_03192018_geocode <- read_rds("./data/crime_12062016_to_03192018_geocode.rds") %>%
  select(-extracted_address)

#remove the rows for which the value of location is NULL and then unnest the remaining locations
crime_12062016_to_03192018_geocode <- 
  remove_null_locations(crime_12062016_to_03192018_geocode) %>%
  unnest(location)

#convert the dataframe to a simple features set
crime_12062016_to_03192018_sf <- st_as_sf(crime_12062016_to_03192018_geocode, 
                                          coords = c("lon", "lat"),
                                          crs = st_crs(parcel_sf))  

#combine the geocoded data with the sf dataframe created earlier
crime_12062016_to_03192018 <- rbind(crime_12062016_to_03192018_split[[1]], crime_12062016_to_03192018_sf)

rm(crime_12062016_to_03192018_split, crime_12062016_to_03192018_geocode, crime_12062016_to_03192018_leftovers, crime_12062016_to_03192018_sf)
#geocode the one item in the Improve Detroit Issues data for which the given coordinates were obviously incorrect, and then convert to an sf object. If geocoding fails, run this bit again
improve_detroit_issues_leftover_sf <- improve_detroit_issues_split[[2]] %>%
  select(-extracted_lat, -extracted_lon) %>%
  mutate(location = map(Address, geocode_pause)) %>%
  unnest(location) %>% 
  st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf))  
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=14530%20%20Vaughan%20Detroit%2C%20Michigan
## Warning: geocode failed with status OVER_QUERY_LIMIT, location = "14530
## Vaughan Detroit, Michigan"
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=14530%20%20Vaughan%20Detroit%2C%20Michigan
#splice with the previously generated sf dataframe
improve_detroit_issues <- rbind(improve_detroit_issues_split[[1]], improve_detroit_issues_leftover_sf)

rm(improve_detroit_issues_split, improve_detroit_issues_leftover_sf)
#create another set of demolition information
completed_demolitions_sf <- completed_demolitions_split[[1]]

#note that the location information in completed_demolitions_sf is complete (the dataframe of examples with missing location information has zero rows)
completed_demolitions_split[[2]]
#another set of demolition information
upcoming_demolitions_sf <- upcoming_demolitions_split[[1]]

upcoming_demolitions_split[[2]]
rm(completed_demolitions, completed_demolitions_split, upcoming_demolitions, upcoming_demolitions_split)

We begin the process of assigning labels to the buildings: blighted or not blighted. Buildings will be represented by parcels that have or have had buildings on them, whether because the parcel_sf data frame indicates that the parcel includes structures or because the dismantle permits dataframe associates a dismantle permit with the parcel, which suggests that there was a building on it.

We will use parcel numbers to refer to the parcels. However, as the following bit of code shows, the parcels dataset contains a few rows in which the parcel numbers are the same (duplicate_parcel_numbers_in_parcel_data contains 78 rows).

#introduce a column of row numbers in the parcels data, for use in data cleaning
parcel_sf <- parcel_sf %>%
  mutate(row_num = row_number())

#As per above, following returns a 78-row data frame
duplicate_parcel_numbers_in_parcel_data <- 
  parcel_sf %>%
  group_by(parcelnum) %>% 
  mutate(n = n()) %>%
  ungroup() %>% 
  filter(n > 1) %>%
  select(parcelnum, address, legaldesc, row_num)
duplicate_parcel_numbers_in_parcel_data

Plotting the parcel data suggests that, within groups of parcels that have the same parcel number, some of the parcels cover the same area while others are disjoint. We thus apply a spatial join, using sf::st_join, to find pairs of parcels that cover the same area. After that, we modify parcel_sf so that it is a maximal set containing none of these duplicates.

#join the result with itself, on the basis of spatial identity
spatial_repeats <- st_join(duplicate_parcel_numbers_in_parcel_data,
                           duplicate_parcel_numbers_in_parcel_data, 
                           st_equals, left = FALSE) %>% filter(row_num.x != row_num.y)

#select one element from each group of spatially identical parcels (keep a row only if its row number has not already appeared as the second member of a pair)
selection_vector <- 1:nrow(spatial_repeats)
for (index in 1:nrow(spatial_repeats)) {
  selection_vector[index] <- !(spatial_repeats$row_num.x[index] %in%
                                spatial_repeats$row_num.y[1:index]) 
}
selection_vector <- as.logical(selection_vector)

#the unique parcels, as selected from spatial_repeats
unique_parcels <- spatial_repeats[selection_vector,]

#the unique parcels, as specified in parcel_sf, with a row number added to the end of the parcel number (character vector)
unique_parcels <- parcel_sf %>% filter(row_num %in% unique_parcels$row_num.x) %>%
  mutate(parcelnum = str_c(parcelnum, "_", row_number()))

#cut out the set of sf objects that have spatial repeats  
parcel_sf <- parcel_sf %>% filter(!(row_num %in% spatial_repeats$row_num.x))

#bind the set of unique spatial objects to parcel_sf
parcel_sf <- parcel_sf %>% rbind(unique_parcels)

Having cut out the duplicate parcels (parcels that cover the same area), we plot the groups (as it turns out, pairs) that have the same parcel number.

library(tidyverse)
library(sf)

repeated_parcel_numbers <- parcel_sf %>% group_by(parcelnum) %>% 
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
repeated_parcel_numbers <- repeated_parcel_numbers %>% select(parcelnum) %>% 
  mutate(rownumber = row_number()) %>% mutate(plotted = FALSE)

for (row_1 in repeated_parcel_numbers$rownumber) {
  if (repeated_parcel_numbers$plotted[row_1] == FALSE) {
    for (row_2 in repeated_parcel_numbers$rownumber) {
      if (row_2 != row_1 & 
          repeated_parcel_numbers$parcelnum[row_1] == repeated_parcel_numbers$parcelnum[row_2]) {
        print(ggplot(repeated_parcel_numbers %>% filter(rownumber %in% c(row_1, row_2)) %>%
                     select(parcelnum)) + geom_sf() + ggtitle(repeated_parcel_numbers$parcelnum[row_1]))
        repeated_parcel_numbers[c(row_1, row_2),]$plotted <- TRUE 
      }
    }
  }
}

rm(repeated_parcel_numbers)

Having verified that the repeats of the parcel numbers represent distinct parcels, we modify the parcel numbers to make them distinct.

#add iteration numbers to parcelnum (a character variable) within each set of repeats
parcel_sf <- parcel_sf %>% group_by(parcelnum) %>% mutate(n = n(), repeat_number = row_number()) %>% 
  ungroup() %>%
  mutate(parcelnum = ifelse(n > 1, str_c(parcelnum, "_", repeat_number), parcelnum)) %>%
  select(-n, -repeat_number)

#test for repeats
temp <- parcel_sf %>% as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% mutate(n = n()) %>% filter(n > 1)
temp

We now use sf::st_join, with st_overlaps, to test for parcel overlap, with the result, according to the test, that there are several thousand pairs of parcels that overlap (share at least some portion of interior area).

#check for overlap of the parcels in parcel_sf
spatial_overlaps <- st_join(parcel_sf, parcel_sf, 
        st_overlaps, left = FALSE) %>% filter(row_num.x != row_num.y)
## although coordinates are longitude/latitude, st_overlaps assumes that they are planar
nrow(spatial_overlaps)
## [1] 9194

Investigating the cases of overlap (according to st_overlaps) with plots of random selections of the supposedly overlapping pairs suggests that the apparent overlap is not significant (see below). It may be due to the fact that st_join treats latitude and longitude values as planar coordinates. I will assume that parcels do not overlap.
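
One way to check that the flagged overlaps are negligible is to measure the intersection areas directly in a projected CRS. The sketch below is not part of the original analysis; it assumes UTM zone 17N (EPSG:32617), which covers Detroit, so that st_area returns square meters.

#sketch: area of overlap for one of the flagged pairs, computed in a projected CRS (EPSG:32617 assumed)
overlap_area <- function(num_x, num_y) {
  poly_x <- st_transform(parcel_sf %>% filter(parcelnum == num_x), 32617)
  poly_y <- st_transform(parcel_sf %>% filter(parcelnum == num_y), 32617)
  st_area(st_intersection(poly_x, poly_y))
}
overlap_area(spatial_overlaps$parcelnum.x[1], spatial_overlaps$parcelnum.y[1])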

#select a random sample of these cases of two parcels that overlap
set.seed(55) 
sample <- sample(1:nrow(spatial_overlaps), 20)
parcel_selection <- spatial_overlaps[sample,]

#plot the elements of the random sample of pairs for which st_overlaps had indicated an overlap
for (index in 1:nrow(parcel_selection)) {
  row_1 <- parcel_sf %>% select(parcelnum) %>% 
    filter(parcelnum == parcel_selection$parcelnum.x[index])
  row_2 <- parcel_sf %>% select(parcelnum) %>% 
    filter(parcelnum == parcel_selection$parcelnum.y[index])
  print(ggplot(rbind(row_1, row_2)) + geom_sf(aes(fill = parcelnum)))
}

rm(duplicate_parcel_numbers_in_parcel_data, selection_vector, unique_parcels, row_1, row_2, spatial_repeats, index, sample, parcel_selection, spatial_overlaps)
#(note that eval is set to false, to avoid rerunning this code every time I implement "run all chunks")

#investigate a few of the cases by road map 
road_map <- function(sf_shape) {
  df <- tibble(longitude = st_coordinates(sf_shape)[,1],
               latitude = st_coordinates(sf_shape)[,2])
  
  #left/bottom/right/top for bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  detroit_gg <- get_map(location = bounding_box, 
                        maptype = "roadmap")
  ggmap(detroit_gg)
}

street_map_1 <- road_map(parcel_sf %>% filter(parcelnum == "13000116.003"))
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.341003,-83.011428&zoom=17&size=640x640&scale=2&maptype=roadmap&language=en-EN
street_map_1 + ggtitle("Parcel 13000116.003") + theme(plot.title = element_text(hjust = 0.5))

#investigate a few of the cases by satellite map
satellite_map <- function(sf_shape) {
  df <- tibble(longitude = st_coordinates(sf_shape)[,1],
               latitude = st_coordinates(sf_shape)[,2])
  
  #left/bottom/right/top for bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  detroit_gg <- get_map(location = bounding_box, 
                        maptype = "satellite")
  ggmap(detroit_gg)
}

street_map_2 <- satellite_map(parcel_sf %>% filter(parcelnum == "13000116.003"))
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.341003,-83.011428&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN
street_map_2 + ggtitle("Parcel 13000116.003") + theme(plot.title = element_text(hjust = 0.5))

We begin the process of creating a list of buildings from the parcels data. One of the issues to consider is the number of buildings on a parcel. Should we restrict our analysis to those parcels that have only one building? In any case, two columns in parcel_sf bear on this aspect: building_s and num_buildi.

#exploration of the numbers of buildings on parcels
buildings_1 <- parcel_sf %>% filter(!is.na(building_s))
levels(buildings_1$building_s)
##  [1] "1/2 DUPLEX"      "2 STY COLONIAL"  "APARTMENT"      
##  [4] "APT FLAT GARDEN" "APT HIGH RISE"   "CARRIAGE HOUSE" 
##  [7] "DUPLEX"          "FOUR FAMILY"     "GARAGE/SHED"    
## [10] "INCOME BUNGALOW" "LARGE FLATS"     "LOFT APT WALKUP"
## [13] "MULTI DWELLING"  "ROW HOUSE"       "SINGLE FAMILY"  
## [16] "TWO FAMILY FLAT"
buildings_2 <- parcel_sf %>% filter(num_buildi > 0)

buildings_3 <- parcel_sf %>% filter(!is.na(building_s) & num_buildi > 0)

buildings_4 <- parcel_sf %>% filter(is.na(building_s) & num_buildi > 0)
nrow(buildings_4)
## [1] 18749
buildings_5 <- parcel_sf %>% filter(!is.na(building_s) & !num_buildi > 0)

#print, but first cut out the geometry column to avoid an error message.
as.data.frame(buildings_5) %>% select(-geometry)

buildings_5 contains zero rows, indicating that buildings_1 is a subset of buildings_2. On the other hand, buildings_4, the set of parcels that according to the data have at least one building but with the building type unspecified (NA in building_s), contains 18,749 rows.

buildings_6 <- parcel_sf %>% filter(num_buildi > 1)

ggplot(buildings_1) + geom_bar(aes(x = as.factor(num_buildi)))

ggplot(buildings_4) + geom_bar(aes(x = as.factor(num_buildi)))

ggplot(buildings_6) + geom_bar(aes(x = building_s))

related <- buildings_1 %>% filter(!is.na(related_pa))
set.seed(27)
sample <- sample_n(related, 20)
dfs <- list()
for (index in 1:nrow(sample)) {
  row_1 <- sample[index,]
  row_2 <- parcel_sf %>% filter(parcelnum == sample$related_pa[index])
  df <- rbind(row_1, row_2)
  print(ggplot(df) + geom_sf(aes(fill = parcelnum)))
  dfs[[index]] <- df
}

as.data.frame(dfs[[2]])
rm(buildings_1,buildings_2, buildings_3, buildings_4, buildings_5, buildings_6)

rm(row_1, row_2, sample, related, dfs, df)

I will ignore the “related parcels” (related_pa) in parcel_sf. Before making the final call as to what subset of the parcels dataframe will provide the stand-in for buildings, I will work on the data that we will use to form the labels.

Like the parcel_sf dataframe, the dismantle permits data also contains some duplicate parcel numbers over rows, in some cases over rows that contain address information suggesting that the location is different. (It also indicates that a few individual locations, identified with addresses, had more than one associated dismantle permit. This need not be problematic: a permit could expire before the work is carried out, or there could be more than one structure on a parcel.) These repetitions are in dup_par_num_over_distinct_addresses, which contains 272 rows. I should also note that, in most of these cases with duplicate parcel numbers (and addresses indicating different locations), the recorded latitude and longitude are identical. We can thus infer that some of this location information is incorrect.
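
As a quick check of the claim about identical coordinates, one could count the distinct recorded positions within each group of rows that share a parcel number but span more than one address (a sketch, using dismantle_permits_sf as constructed above):

#sketch: for parcel numbers spread over more than one address string, count the distinct recorded positions
position_check <- dismantle_permits_sf %>%
  mutate(lon = st_coordinates(dismantle_permits_sf)[, 1],
         lat = st_coordinates(dismantle_permits_sf)[, 2]) %>%
  as.data.frame() %>%
  group_by(`Parcel Number`) %>%
  filter(n() > 1, n_distinct(`Site Address`) > 1) %>%
  summarize(addresses = n_distinct(`Site Address`),
            distinct_positions = n_distinct(paste(lon, lat)))

#groups with distinct_positions equal to 1 list different addresses but the same coordinates
position_check %>% count(distinct_positions == 1)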

#repeated parcel numbers in the dismantle permits data
dup_par_num_in_dismantle_data <-
  dismantle_permits_sf %>%
  group_by(`Parcel Number`) %>% 
  mutate(n = n()) %>%
  ungroup() %>% 
  filter(n > 1) %>%
  select(`Parcel Number`, `Site Address`) %>%
  arrange(`Parcel Number`)
rm(dup_par_num_in_dismantle_data)

#parcel numbers in the dismantle permits data that are distributed over distinct address strings
dup_par_num_over_distinct_addresses <- 
  dismantle_permits_sf %>%
  group_by(`Parcel Number`) %>%
  mutate(parcel_number_occurances = n()) %>%
  ungroup() %>%
  filter(parcel_number_occurances > 1) %>%
  group_by(`Parcel Number`, `Site Address`) %>%
  mutate(m = n()) %>%
  filter(m < parcel_number_occurances) %>%
  arrange(`Parcel Number`)
#remove extraneous variables and then geocode the dismantle site addresses
geocoded_duplicates <- dup_par_num_over_distinct_addresses %>%
  as.data.frame %>%
  select(-geometry) %>%
  mutate(location = map(str_c(`Site Address`, ", Detroit, Michigan"), geocode_pause))
  
#avoid having to do the geocoding again when re-running the code
write_rds(geocoded_duplicates, "./data/geocoded_duplicates.rds")  

Let’s have a look at the relationship between the numbers of buildings on the parcels, as indicated in parcel_sf, and the numbers of times parcels occur in dup_par_num_over_distinct_addresses (which was constructed from the dismantle permits data).

#read the saved geocoded file into R
geocoded_duplicates <- read_rds("./data/geocoded_duplicates.rds")

#unpack the column of dataframes created by the geocoding, remove rows with identical geocoding results, and convert the resulting dataframe into a set of sf objects
geocoded_duplicates <- remove_null_locations(geocoded_duplicates) %>% 
  unnest(location) %>% distinct(lon, lat, .keep_all = TRUE) %>%
  st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf))

#investigate the relationship between the number of buildings on a parcel, as specified in parcel_sf, and the number of occurrences of parcel numbers in geocoded_duplicates
joined_w_parcels_sf <- as.data.frame(geocoded_duplicates) %>% 
  group_by(`Parcel Number`) %>% 
  summarize(`number of locations` = n()) %>%
  inner_join(as.data.frame(parcel_sf), by = c("Parcel Number" = "parcelnum")) 
ggplot(joined_w_parcels_sf) +
  geom_count(aes(x = as.factor(`number of locations`),
                 y = num_buildi))

#cut out the greater values of num_buildi, so as to widen the vertical scale
ggplot(joined_w_parcels_sf %>% filter(num_buildi < 11)) +
  geom_count(aes(x = as.factor(`number of locations`),
                 y = as.factor(num_buildi)))

Note the large number of cases for which num_buildi is equal to zero. If the data is accurate, then it is a reasonable guess that such cases reflect parcels for which num_buildi has been updated after all buildings have been removed.
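
One rough way to probe that guess is to check whether these zero-building parcels show up in the completed demolitions data (a sketch; it ignores timing and any formatting differences between the parcel-number columns):

#rough check: do the parcels with num_buildi equal to zero appear in the completed demolitions data?
joined_w_parcels_sf %>%
  filter(num_buildi == 0) %>%
  mutate(in_completed_demolitions = `Parcel Number` %in% completed_demolitions_sf$`Parcel ID`) %>%
  count(in_completed_demolitions)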

temp3 <- geocoded_duplicates %>% #select(`Parcel Number`, location, `Permit Location`) %>%
  filter(`Parcel Number` == "13006809.")

#function for putting a set of sf points on a satellite map
sf_points_map <- function(sf_df) {
  df <- sf_df %>% mutate(longitude = st_coordinates(sf_df)[,1],
                                latitude = st_coordinates(sf_df)[,2]) %>%
  as.data.frame %>% select(longitude, latitude)
  
  #left/bottom/right/top for bounding box
  bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002, 
                    max(df$longitude) + 0.002, max(df$latitude) + 0.002)
  detroit_gg <- get_map(location = bounding_box, 
                        maptype = "satellite")
  ggmap(detroit_gg) + geom_point(data = df, aes(x = longitude, y = latitude),
                                 color = "red") 
}

#map of points with Parcel Number listed as "13006809."
sf_points_map(temp3)
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.410747,-83.043346&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN

#note that "13006809." is not reflected as a parcel number in parcel_sf (noting that parcel_sf$parcelnum is an integer vector).
nrow(parcel_sf %>% filter(parcelnum == "13006809."))
## [1] 0
#spatial join to see where these points in the dismantle permits dataset hook up with the parcels dataset, with the result that the parcel contains three of the seven points with `Parcel Number` = "13006809.". They are nearby, although some may be closer to other parcels.
temp4 <- st_join(parcel_sf, temp3, left = FALSE, st_contains)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
temp4$`Parcel Number`
## [1] "13006809." "13006809." "13006809."
as.numeric(as.character(temp4$parcelnum))
## [1] 13006809 13006809 13006809
temp5 <- parcel_sf %>%
  filter((!as.numeric(as.character(parcelnum)) < 13006809) &
          as.numeric(as.character(parcelnum)) < 13006810) %>%
  select(parcelnum)
as.data.frame(temp5$parcelnum)
#plot with parcel "13006809." on a satellite map
sf_points_map(temp3) +  
  geom_sf(data = temp5 %>% select(geometry), crs = 3857, inherit.aes = FALSE)
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.410747,-83.043346&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

rm(temp3, temp4, temp5, joined_w_parcels_sf, geocoded_duplicates, temp)

It is notable that one of the positions associated with the dismantle permits is far enough from the two parcels to be potentially closer to another parcel, which (upon further investigation with Google maps) appears to be adjacent to a vacant lot, on which a building, now dismantled, may have sat.

Further investigation of Parcel Number duplicates in the dismantle permits data:

#read in the saved set of examples in the dismantle permits data that involve repeats of parcel numbers over different address strings.
dismantle_duplicates_geocoded <- read_rds("./data/geocoded_duplicates.rds")

#unpack the column of data frames (the outputs of geocode())
dismantle_duplicates_geocoded <- remove_null_locations(dismantle_duplicates_geocoded) %>%
   unnest(location)

#note the 16 rows of dismantle_duplicates_geocoded for which another row lists the same longitude and latitude
dismantle_duplicates_geocoded %>% 
  group_by(lon, lat) %>% mutate(n = n()) %>% filter(n > 1)
#keep only the first of any group of rows with the same longitude and latitude
dismantle_duplicates_geocoded <-
  dismantle_duplicates_geocoded %>% distinct(lon, lat, .keep_all = TRUE)

head(dismantle_duplicates_geocoded)
#manually cut out the one remaining apparent duplicate location
dismantle_duplicates_geocoded <- dismantle_duplicates_geocoded %>% 
  filter(!(`Site Address` == "3200 E LAFAYETTE-MARTIN LUTHER KING HIGH"))

dismantle_duplicates_geocoded <- dismantle_duplicates_geocoded %>%
  st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf)) %>%
  select(-parcel_number_occurances, -m)

#where possible, find the parcels that contain the positions corresponding to the coordinates
spacial_join_within <- 
  st_join(dismantle_duplicates_geocoded, 
          parcel_sf %>% select(parcelnum), 
          st_within)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#change the parcel numbers (for the dismantle permits data) to those of the parcels that contain the points specified by the coordinates. remove the parcelnum column (from the parcel_sf data frame)
dismantle_duplicates_geocoded <- spacial_join_within %>%
  mutate(`Parcel Number` = ifelse(!is.na(parcelnum),
                                  as.character(parcelnum),
                                  `Parcel Number`)) %>%
  select(-parcelnum)

#replace rows with duplicates of parcel numbers over distinct addresses with the cleaned set of rows, as per above
dismantle_permits_sf <- dismantle_permits_sf %>% 
  filter(!(`Parcel Number` %in% dup_par_num_over_distinct_addresses$`Parcel Number`)) %>%
  rbind(dismantle_duplicates_geocoded)

rm(dismantle_duplicates_geocoded, dup_par_num_over_distinct_addresses, spacial_join_within)
rm(index)

Using Google Street View to investigate some more dismantle permit entries for which there are other entries with the same parcel number but different addresses, we find mostly different locations at the same building (e.g. the two sides of a duplex), or perhaps two attached buildings.

Continuing with the matter of assigning labels to the buildings, we now look at the completed demolitions data, completed_demolitions_sf. Although the initial idea was to use the dismantle permits data to assign the labels, blighted (to be dismantled) and not blighted (not to be dismantled), we should consider using the completed demolitions data instead or in addition. Although this data may omit blighted buildings which, for whatever reason, have not been demolished or which, as suggested on the City-of-Detroit web page from which the file was downloaded, may have been demolished in a hurry, the data appears to be relatively clean and genuinely reflective of blightedness. For example, if a new property owner dismantled a well-kept building for some reason, say to build a larger home, then, it may seem, the demolition of this building would be reflected in the dismantle-permits dataset but not in the completed-demolitions dataset.

#look for repeats of "Parcel ID" on completed_demolitions_sf, finding none
as.data.frame(completed_demolitions_sf) %>%
  select(-geometry) %>%
  group_by(`Parcel ID`) %>%
  mutate(n = n()) %>%
  filter(n > 1)
#check to see how this data joins with the parcels data, noting that the inner join contains exactly seven fewer rows than completed_demolitions_sf
demolition_join_on_parcels <- as.data.frame(parcel_sf) %>% filter(!is.na(parcelnum)) %>%
  inner_join(as.data.frame(completed_demolitions_sf %>% filter(!is.na(`Parcel ID`))),
                                         by = c("parcelnum" = "Parcel ID"))

#look for repeats of parcelnum in demolition_join_on_parcels, finding none. We thus find near total agreement between the parcel_sf data frame and the completed demolitions data frame: given the information in parcel_sf, the parcel numbers in the completed demolitions dataset appear to be consistent with the location data.
as.data.frame(demolition_join_on_parcels) %>%
  select(-geometry.x, -geometry.y) %>%
  group_by(parcelnum) %>%
  mutate(n = n()) %>%
  filter(n > 1)
#have a look at the seven rows of completed_demolitions for which the `Parcel ID` did not match with a value of parcelnum in parcel_sf
dismantled_parcels_left_out <- completed_demolitions_sf %>% 
  filter(!`Parcel ID` %in% demolition_join_on_parcels$parcelnum)

#link these to the parcels data set, parcel_sf, using a spatial join
dismantled_left_out_joined <- dismantled_parcels_left_out %>%
  st_join(parcel_sf %>% select(geometry), st_covered_by, left = FALSE)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
rm(demolition_join_on_parcels, dismantled_left_out_joined, dismantled_parcels_left_out)

In an effort to make our predictors properly relevant to what they are predicting, we will restrict our labels to the period from June of 2016 onward. We thus need to convert some of the date information, which is in the form of a string, into a proper date format. Although three datasets are being used to construct our labels, the conversion will be necessary for only two of the datasets.

require(lubridate)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
completed_demolitions_sf$`Demolition Date` <- mdy(completed_demolitions_sf$`Demolition Date`)

dismantle_permits_sf$`Permit Issued` <- mdy(dismantle_permits_sf$`Permit Issued`)

Our parcel list, the stand-ins for buildings, will be formed from those parcels within the Hardest Hit Fund areas that satisfy one or more of the following conditions: the parcel has at least one building on it according to parcel_sf (num_buildi > 0), or the parcel is associated with a completed demolition, an upcoming demolition, or a dismantle permit.

Having determined that the great majority of the parcels have only one building, we may guess that the existence of a few parcels with repeats won’t have much of a distorting effect.

After construction of a list of parcels, we proceed to a set of labels, blighted or not blighted, for each parcel, based on the building’s either having been dismantled or having had a dismantle permit associated with it. In associating dismantle permits and actual demolitions with individual parcels, I have prioritized spatial association (using sf::st_join) over matching of parcel numbers. (That is, in the limited number of cases where these methods of association disagree, we use the spatial association.)

#the set of all rows of parcel_sf that are within one of the hardest hit areas
parcels_in_hardest_hit_areas <- parcel_sf %>%
  st_join(Hardest_Hit_Fund_Areas, st_within, left = FALSE)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#add a row number to completed_demolitions_sf, so as to keep an account of which rows have been included in the following joins
completed_demolitions_sf <- completed_demolitions_sf %>% mutate(rownumber = row_number())

#inner spatial join of parcels_in_hardest_hit_areas and completed_demolitions_sf, on the basis of containment, within the parcel, of the point associated with the demolition
completed_in_fund_areas_parcel_spatial <- parcels_in_hardest_hit_areas %>%
  st_join(completed_demolitions_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
completed_in_fund_areas_parcel_spatial <- completed_in_fund_areas_parcel_spatial %>%
  filter(`Demolition Date` > mdy("05/31/2016"))

#inner join of parcels_in_hardest_hit_areas and completed_demolitions_sf, on the basis of identity of parcel numbers, for those elements of completed_demolitions_sf that were not captured in the spatial join
completed_in_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
  inner_join(as.data.frame(completed_demolitions_sf %>% 
                             filter(! rownumber %in% completed_in_fund_areas_parcel_spatial$rownumber)), 
             by = c("parcelnum" = "Parcel ID"))

completed_in_fund_areas_parcelnum_join <- completed_in_fund_areas_parcelnum_join %>%
  filter(`Demolition Date` > mdy("05/31/2016"))

#add a row number to upcoming_demolitions_sf, so as to keep an account of which rows have been included in the following joins
upcoming_demolitions_sf <- upcoming_demolitions_sf %>% mutate(rownumber = row_number())

#inner spatial join of parcels_in_hardest_hit_areas and upcoming_demolitions_sf, on the basis of containment, within the parcel, of the point associated with the demolition
upcoming_in_fund_areas_parcel_spatial <- parcels_in_hardest_hit_areas %>%
  st_join(upcoming_demolitions_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
#inner join of parcels_in_hardest_hit_areas and upcoming_demolitions_sf, on the basis of identity of parcel numbers, for those elements of upcoming_demolitions_sf that were not captured in the spatial join
upcoming_in_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
  inner_join(as.data.frame(upcoming_demolitions_sf %>% 
                             filter(! rownumber %in% upcoming_in_fund_areas_parcel_spatial$rownumber)), 
             by = c("parcelnum" = "Parcel ID"))

#repeat the above three steps for the parcels that have dismantle permits associated with them
dismantle_permits_sf <- dismantle_permits_sf %>% mutate(rownumber = row_number())
  
dismantle_permit_fund_areas_spatial <- parcels_in_hardest_hit_areas %>%
  st_join(dismantle_permits_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
dismantle_permit_fund_areas_spatial <- dismantle_permit_fund_areas_spatial %>%
  filter(`Permit Issued` > mdy("05/31/2016"))

dismantle_permit_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
  inner_join(as.data.frame(dismantle_permits_sf %>%
                             filter(! rownumber %in% dismantle_permit_fund_areas_spatial$rownumber)), 
             by = c("parcelnum" = "Parcel Number"))

dismantle_permit_fund_areas_parcelnum_join <- dismantle_permit_fund_areas_parcelnum_join %>%
  filter(`Permit Issued` > mdy("05/31/2016"))

#data frame with the parcel numbers of all parcels that have had at least one dismantled building or dismantle permit associated with them
blighted_parcels <- rbind(as.data.frame(completed_in_fund_areas_parcel_spatial) %>% select(parcelnum), 
                          as.data.frame(completed_in_fund_areas_parcelnum_join) %>% select(parcelnum),
                          as.data.frame(dismantle_permit_fund_areas_spatial) %>% select(parcelnum),
                          as.data.frame(dismantle_permit_fund_areas_parcelnum_join) %>% select(parcelnum),
                          as.data.frame(upcoming_in_fund_areas_parcel_spatial) %>% select(parcelnum), 
                          as.data.frame(upcoming_in_fund_areas_parcelnum_join) %>% select(parcelnum)) %>%
  distinct(parcelnum)

#data frame with the parcel numbers for all parcels in the hardest hit areas that have or have had at least one building on the parcel
parcels_set <- as.data.frame(blighted_parcels) %>% 
  select(parcelnum) %>%
  rbind(as.data.frame(parcels_in_hardest_hit_areas) %>% 
          filter(num_buildi > 0) %>% select(parcelnum)) %>%
  distinct(parcelnum)

#our labels
labels <- parcels_set %>% mutate(blighted = ifelse(parcelnum %in% blighted_parcels$parcelnum, 1, 0))

#note the balance of blighted versus not-blighted labels.
labels %>% count(blighted) %>% mutate(fraction = n / sum(n))
rm(completed_in_fund_areas_parcel_spatial, completed_in_fund_areas_parcelnum_join, dismantle_permit_fund_areas_spatial, dismantle_permit_fund_areas_parcelnum_join, parcels_set, blighted_parcels, parcels_in_hardest_hit_areas, upcoming_in_fund_areas_parcel_spatial, upcoming_in_fund_areas_parcelnum_join)

We now construct a balanced set of labels, with roughly equal numbers of blighted and non-blighted examples. We then divide the set of parcel numbers into training and test sets, on a ratio of 0.85 to 0.15. Note that we will be using 10-fold cross validation on the training set, with the 15 percent of the items set aside for a final test of our model.

#note the balance of blighted versus not-blighted labels.
labels %>% count(blighted) %>% mutate(fraction = n / sum(n))
labels_balanced <- labels %>% filter(blighted == 0) %>% sample_n(2200) %>%
  rbind(labels %>% filter(blighted == 1))

set.seed(82)
training_rows <- caret::createDataPartition(labels_balanced$blighted, p = 0.85, list = FALSE)
training_parcelnums <- labels_balanced[training_rows,]$parcelnum
testing_parcelnums <- labels_balanced[-training_rows,]$parcelnum

Having explored and cleaned the data and constructed a set of labels, we proceed to the construction of predictive models. The first step will be to add some properties to the set of labels.

library(lubridate)

#add the parcel geometry to the labels
parcels_with_labels <- parcel_sf %>% select(parcelnum) %>%
  inner_join(labels_balanced, by = "parcelnum")

#check to see that there are no parcel number repeats (finding none)
parcels_with_labels %>% as.data.frame() %>% select(-geometry) %>% count(parcelnum) %>% filter(n > 1)
#note that `Ticket ID` provides a key for blight_violations_sf (temp has zero elements)
temp <- blight_violations_sf %>% as.data.frame() %>% select(-geometry) %>%
  count(`Ticket ID`) %>% filter(n > 1)
temp
rm(temp)

blight_violations_sf <- blight_violations_sf %>% 
  filter(mdy(`Violation Date`) > mdy("05/31/2009"))

#have a look at the distribution of violators recorded in blight_violations_sf (as indicated by `Violator ID`), noting that the following construction contains zero rows (`Violator ID` thus doesn't seem to be useful)
violator_distribution <- blight_violations_sf %>% 
  as.data.frame() %>% select(-geometry) %>% count(`Violator ID`) %>% filter(n > 1)
violator_distribution
rm(violator_distribution)

#associate the blight violations data with parcels_with_labels, first with a spatial join and then, for any rows in blight_violations_sf for which the spatial association fails, with an inner join on the parcel numbers. recall that parcels_with_labels is restricted to the hardest hit areas
spatial_join <- parcels_with_labels %>% 
  st_join(blight_violations_sf, st_contains, left = FALSE) %>%
  select(-`Violation Parcel ID`)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
#make use of the key `Ticket ID` to find the leftovers
blight_violations_leftovers <- blight_violations_sf %>% filter(! `Ticket ID` %in% spatial_join$`Ticket ID`)

#now handle the leftovers
parcelnum_join <- parcels_with_labels %>% 
  inner_join((as.data.frame(blight_violations_leftovers) %>% select(-geometry)),
             by = c("parcelnum" = "Violation Parcel ID"))

blight_violation_data <- rbind(spatial_join, parcelnum_join)

blight_violation_data <- inner_join(parcels_with_labels, 
                                     as.data.frame(blight_violation_data) %>% select(-geometry, -blighted), 
                                     by = "parcelnum")

rm(spatial_join, blight_violations_leftovers, parcelnum_join)

We consider a histogram of the number of blight violations per parcel.

#dataset with two calculated variables, the total number of blight violations and the total fines by parcel, perhaps to be used as predictors
blight_violation_tallies <- blight_violation_data %>% as.data.frame() %>% select(-geometry) %>%
  mutate(`Fine Amount` = as.numeric(str_sub(`Fine Amount`, start = 2))) %>%
  group_by(parcelnum, blighted) %>%
  summarize(number_of_violations_by_parcel = n(), total_fines_by_parcel = sum(`Fine Amount`))

blight_violation_tallies <- as_tibble(blight_violation_tallies)

parcels_without_blight_violations <- labels_balanced %>% 
  filter(!parcelnum %in% blight_violation_tallies$parcelnum) %>%
  mutate(number_of_violations_by_parcel = 0, total_fines_by_parcel = 0)

blight_violation_tallies <- bind_rows(blight_violation_tallies, parcels_without_blight_violations)

blight_violation_summary <- blight_violation_tallies %>% 
  filter(number_of_violations_by_parcel < 19) %>%
  group_by(number_of_violations_by_parcel, blighted) %>% summarize(n = n()) 

ggplot(blight_violation_summary) + 
  geom_col(aes(x = number_of_violations_by_parcel,
               y = n, 
               fill = as.factor(blighted)),
           position = "dodge2")

rm(blight_violation_summary)  

Now consider a histogram of the fine totals.

head(blight_violation_tallies)
blight_violation_tallies %>% filter(total_fines_by_parcel < 5000) %>% 
  ggplot(aes(x = total_fines_by_parcel, fill = as.factor(blighted))) + 
  geom_histogram(binwidth = 200)

We will next construct a simple logistic model, using the constructed dataframe that we used for the two plots above. The two predictors will be the number of blight violations and the total amount in fines.

#separate the examples to be used for training from the examples withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- blight_violation_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

models <- 1:10 %>% map(~ glm(blighted ~ total_fines_by_parcel + number_of_violations_by_parcel,
                             data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5759393
sd(as.numeric(accuracies))
## [1] 0.02432749
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 163 136
##   TRUE   24  41
##        truth
## pred      0   1
##   FALSE 155 118
##   TRUE   37  54
##        truth
## pred      0   1
##   FALSE 166 131
##   TRUE   22  46
##        truth
## pred      0   1
##   FALSE 154 142
##   TRUE   23  45
##        truth
## pred      0   1
##   FALSE 156 125
##   TRUE   30  53
##        truth
## pred      0   1
##   FALSE 161 129
##   TRUE   20  54
##        truth
## pred      0   1
##   FALSE 176 130
##   TRUE   18  40
##        truth
## pred      0   1
##   FALSE 165 105
##   TRUE   33  61
##        truth
## pred      0   1
##   FALSE 143 145
##   TRUE   24  52
##        truth
## pred      0   1
##   FALSE 160 121
##   TRUE   31  52
#construct a grid of predictor values and then represent the predictions with colors
library(modelr)
number_of_violations_by_parcel <- 
  seq_range(train$number_of_violations_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
total_fines_by_parcel <- seq_range(train$total_fines_by_parcel, n = 10, pretty = TRUE, trim = 0.05)

grid <- crossing(number_of_violations_by_parcel,
                 total_fines_by_parcel)

#have a look at the third model (among the k-fold validation models)
grid <- grid %>% add_predictions(models[[3]])

ggplot(grid) + geom_point(aes(x = number_of_violations_by_parcel, 
                              y = total_fines_by_parcel, color = pred > 0.5))

Note that the models are much better at predicting negative instances (truth = 0) than predicting positive instances (truth = 1). (They get most of the positive instances wrong.)
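
To put numbers on this, one could average the per-class recall over the ten validation folds (a quick sketch, reusing the predictions and folds computed above):

#sketch: recall for the negative and positive classes, averaged over the ten validation folds
recalls <- 1:10 %>% map_dfr(function(i) {
  truth <- train[k_folds[[i]],]$blighted
  predicted <- as.numeric(!predictions[[i]] < 0.5)
  tibble(recall_negative = mean(predicted[truth == 0] == 0),
         recall_positive = mean(predicted[truth == 1] == 1))
})
colMeans(recalls)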

We now compare this with a simple rpart (decision tree) model.

library(rpart)

#separate the examples to be used for training from the examples withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- blight_violation_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

models <- 1:10 %>% map(~ rpart(blighted ~ total_fines_by_parcel + number_of_violations_by_parcel,
                             data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5951626
sd(as.numeric(accuracies))
## [1] 0.03001074
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 158 100
##   TRUE   28  78
##        truth
## pred      0   1
##   FALSE 146  97
##   TRUE   40  81
##        truth
## pred      0   1
##   FALSE 143  99
##   TRUE   44  79
##        truth
## pred      0   1
##   FALSE 141 108
##   TRUE   45  70
##        truth
## pred      0   1
##   FALSE 142  99
##   TRUE   44  79
##        truth
## pred      0   1
##   FALSE 145 111
##   TRUE   41  67
##        truth
## pred      0   1
##   FALSE 139 105
##   TRUE   47  73
##        truth
## pred      0   1
##   FALSE 142  98
##   TRUE   44  80
##        truth
## pred      0   1
##   FALSE 138 111
##   TRUE   48  67
##        truth
## pred      0   1
##   FALSE 132 111
##   TRUE   54  67
#construct a grid of predictor values and then represent the predictions with colors
library(modelr)
number_of_violations_by_parcel <- 
  seq_range(train$number_of_violations_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
total_fines_by_parcel <- seq_range(train$total_fines_by_parcel, n = 10, pretty = TRUE, trim = 0.05)

grid <- crossing(number_of_violations_by_parcel,
                 total_fines_by_parcel)

#have a look at the ninth model (among the k-fold validation models)
grid <- grid %>% add_predictions(models[[9]])

#plot the predictions for that model; for rpart, predict returns class probabilities,
#so we threshold the probability of the blighted class (the second column)
ggplot(grid) + geom_point(aes(x = number_of_violations_by_parcel, 
                              y = total_fines_by_parcel, color = pred[,2] > 0.5))

models[[4]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3277 1602 0 (0.5111382 0.4888618)  
##   2) total_fines_by_parcel< 35 2216  931 0 (0.5798736 0.4201264) *
##   3) total_fines_by_parcel>=35 1061  390 1 (0.3675778 0.6324222) *

Note that the simple decision-tree model is somewhat better than the logistic model at teasing out the positive instances, and somewhat less effective with respect to the negative instances.

We now add the improve_detroit_issues dataset to the model. Note that this dataset does not contain data for 2013 or earlier.
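As a rough check of that claim, one could inspect the span of dates covered. This is only a sketch: it assumes the issues data stores its creation date in a character column named `Created At` in month/day/year form, which should be adjusted to the actual column name in the downloaded file.

#sketch: inspect the date range covered by the improve detroit issues data
improve_detroit_issues %>%
  mutate(created = lubridate::parse_date_time(`Created At`, orders = c("mdy HMS", "mdy"))) %>%
  summarize(earliest = min(created, na.rm = TRUE), latest = max(created, na.rm = TRUE))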

#note the possible issue types, and then restrict the set to include issues that would seem to be associated with blight in a particular building
levels(as.factor(improve_detroit_issues$`Issue Type`))
##  [1] "Blocked Catch Basin"                          
##  [2] "Cemetery Issue"                               
##  [3] "Clogged Drain"                                
##  [4] "Curbside Solid Waste Issue"                   
##  [5] "Dead Animal Removal"                          
##  [6] "DPW - Debris Removal - DPW USE ONLY"          
##  [7] "DPW - Other environmental - DPW USE ONLY"     
##  [8] "Fire Hydrant Issue"                           
##  [9] "Illegal Dump Sites"                           
## [10] "Manhole Cover Issue"                          
## [11] "New LED Street Light Out"                     
## [12] "Other - Not within City jurisdiction"         
## [13] "Other - Not within scope of City services"    
## [14] "Other - Referred to other City Department"    
## [15] "Park Issue"                                   
## [16] "Potholes"                                     
## [17] "Residential Snow Removal Issue"               
## [18] "Rodent Extermination - BSEED Only"            
## [19] "Running Water in a Home or Building"          
## [20] "Squatters Issue"                              
## [21] "Squatters Issue - TEST ONLY"                  
## [22] "Street Light / Street Light Pole Major Repair"
## [23] "Street Light Out"                             
## [24] "Street Light Pole Down"                       
## [25] "Traffic Sign Issue"                           
## [26] "Traffic Signal Issue"                         
## [27] "Tree Issue"                                   
## [28] "Water Main Break"
#we'll work with the following subset, which keeps only the issue types that plausibly indicate blight at a particular property
improve_detroit_issues_property <- improve_detroit_issues %>%
  filter(`Issue Type` %in% c("Residential Snow Removal Issue", "Running Water in a Home or Building", 
                             "Curbside Solid Waste Issue",  "DPW - Debris Removal - DPW USE ONLY", 
                             "Rodent Extermination - BSEED Only", "Squatters Issue", "Illegal Dump Sites"))

#add the parcel geometry to the dataset with the blight violation tallies
blight_violation_tallies <- parcels_with_labels %>% select(geometry, parcelnum) %>% 
  inner_join(blight_violation_tallies, by = "parcelnum")

#expand the parcel geometry to include all points within 200 meters of the parcel
library(units)
## 
## Attaching package: 'units'
## The following object is masked from 'package:base':
## 
##     %*%
distance <- set_units(200, "m")
blight_violation_tallies <- blight_violation_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

improve_detroit_issues_tallies <- improve_detroit_issues_property %>% 
  st_transform(2253) %>%
  st_join(blight_violation_tallies, join = st_within) %>% 
  as.tibble() %>% select(-geometry) %>% count(parcelnum)

#note the distribution of the tally numbers
improve_detroit_issues_tallies %>% filter(n < 200) %>% ggplot(aes(x = n)) +
  geom_histogram(bins = 50) + labs(y = "count within 200 meters") +
  ggtitle("Improve Detroit Issues Associated with Properties") + 
  theme(plot.title = element_text(hjust = 0.5))

#join the issue tallies to the blight violation tallies, treating parcels with no recorded issues as zero
improve_detroit_and_violations_tallies <- blight_violation_tallies %>% as.data.frame() %>% 
  select(-geometry) %>% left_join(improve_detroit_issues_tallies, by = "parcelnum") %>%
  mutate(improve_issues_tallies = ifelse(is.na(n), 0, n)) %>% select(-n)
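Incidentally, the NA-to-zero step above could equivalently be written with tidyr's replace_na; the following is just a tiny sketch of that alternative on a toy column, not a change to the pipeline above.

#sketch: the same zero-imputation written with tidyr::replace_na instead of ifelse
tibble(n = c(3, NA, 1)) %>% mutate(n = replace_na(n, 0))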

We add this data to our models, and again try both decision-tree and logistic-regression methods.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- improve_detroit_and_violations_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + improve_issues_tallies

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5784134
sd(as.numeric(accuracies))
## [1] 0.03085671
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 164 135
##   TRUE   15  50
##        truth
## pred      0   1
##   FALSE 163 105
##   TRUE   36  60
##        truth
## pred      0   1
##   FALSE 154 128
##   TRUE   27  56
##        truth
## pred      0   1
##   FALSE 169 128
##   TRUE   20  47
##        truth
## pred      0   1
##   FALSE 141 155
##   TRUE   23  45
##        truth
## pred      0   1
##   FALSE 175 113
##   TRUE   26  50
##        truth
## pred      0   1
##   FALSE 155 126
##   TRUE   31  52
##        truth
## pred      0   1
##   FALSE 168 120
##   TRUE   29  47
##        truth
## pred      0   1
##   FALSE 158 140
##   TRUE   23  43
##        truth
## pred      0   1
##   FALSE 159 130
##   TRUE   25  50
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- improve_detroit_and_violations_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as in the logistic model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5951648
sd(as.numeric(accuracies))
## [1] 0.03045012
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 142 105
##   TRUE   44  73
##        truth
## pred      0   1
##   FALSE 151 105
##   TRUE   35  73
##        truth
## pred      0   1
##   FALSE 149 108
##   TRUE   38  70
##        truth
## pred      0   1
##   FALSE 149 103
##   TRUE   37  75
##        truth
## pred      0   1
##   FALSE 148  99
##   TRUE   38  79
##        truth
## pred      0   1
##   FALSE 149  98
##   TRUE   37  80
##        truth
## pred      0   1
##   FALSE 132 116
##   TRUE   54  62
##        truth
## pred      0   1
##   FALSE 135  98
##   TRUE   51  80
##        truth
## pred      0   1
##   FALSE 132  92
##   TRUE   54  86
##        truth
## pred      0   1
##   FALSE 139 115
##   TRUE   47  63

We now try to work with the crime data.

library(units)
library(lubridate)

#improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>%
#  as.data.frame() %>% select(-geometry)

#add the parcel geometry to the data we are using for modeling
improve_detroit_and_violations_tallies <- parcels_with_labels %>% select(-blighted) %>%
  left_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>% st_sf()

improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>% st_sf()

distance <- set_units(200, "m")  #for crimes within 200 meters

#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_later_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

#ggplot(improve_detroit_and_violations_tallies[100,]) + geom_sf() 

tallies_with_later_crime <- tallies_with_later_crime %>%
  st_join(crime_12062016_to_03192018 %>% st_transform(2253),
          join = st_contains)

#count the number of recent recorded crimes within 200 meters, noting that every parcel has at least one
tallies_with_later_crime <- tallies_with_later_crime %>% as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(later_recorded_crime_incidents = n())

#now tally the earlier crime incidents, by roughly repeating the last few steps

#add the parcel geometry to the data we are using for modeling
#tallies_with_earlier_crime <- parcels_with_labels %>% select(-blighted) %>%
#  left_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>% st_sf()

#improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>% st_sf()

earlier_crime_sf <- crime_to_12062016_sf %>% 
  mutate(INCIDENTDATE = mdy(str_sub(INCIDENTDATE, 1, 8))) %>%
  filter(INCIDENTDATE > mdy("12/31/2014")) %>%
  mutate(general_type = case_when(
  CATEGORY %in% c("AGGRAVATED ASSAULT", "ASSAULT", "HOMICIDE", "ROBBERY", 
                  "ARSON", "DAMAGE TO PROPERTY") ~ "Violent Crime",
  CATEGORY %in% c("BURGLARY", "STOLEN PROPERTY", "STOLEN VEHICLE",
                  "OTHER BURGLARY", "LARCENY") ~ "Property Crime",
  CATEGORY %in% c("SEX OFFENSES", "SOLICITATION", "VAGRANCY (OTHER)") ~ "Nuiscance Offences"))

  
#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_property_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

tallies_with_earlier_property_crime <- tallies_with_earlier_property_crime %>%
  st_join(earlier_crime_sf %>% filter(general_type == "Property Crime") %>% st_transform(2253),
          join = st_contains)

#count the number of earlier recorded property crimes within 200 meters, noting that every parcel has at least one
tallies_with_earlier_property_crime <- tallies_with_earlier_property_crime %>% as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(earlier_property_crime_incidents = n())

#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_violent_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

tallies_with_earlier_violent_crime <- tallies_with_earlier_violent_crime %>%
  st_join(earlier_crime_sf %>% filter(general_type == "Violent Crime") %>% st_transform(2253),
          join = st_contains)

#count the number of earlier recorded violent crimes within 200 meters, noting that every parcel has at least one
tallies_with_earlier_violent_crime <- tallies_with_earlier_violent_crime %>% 
  as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(earlier_violent_crime_incidents = n())

#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_nuiscance_crime <- improve_detroit_and_violations_tallies %>% 
  st_transform(2253) %>% st_buffer(distance)

tallies_with_earlier_nuiscance_crime <- tallies_with_earlier_nuiscance_crime %>%
  st_join(earlier_crime_sf %>% filter(general_type == "Nuiscance Offences") %>% st_transform(2253),
          join = st_contains)

#count the number of earlier recorded nuisance offences within 200 meters, noting that every parcel has at least one
tallies_with_earlier_nuiscance_crime <- tallies_with_earlier_nuiscance_crime %>% 
  as.data.frame() %>% select(-geometry) %>% 
  group_by(parcelnum) %>% summarize(earlier_nuiscance_offences = n())


tallies_with_crime <- tallies_with_later_crime %>%
  inner_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>%
  inner_join(tallies_with_earlier_property_crime, by = "parcelnum") %>%
  inner_join(tallies_with_earlier_violent_crime, by = "parcelnum") %>%
  inner_join(tallies_with_earlier_nuiscance_crime, by = "parcelnum")
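The four buffer / spatial-join / count steps above all follow the same pattern, so they could be factored into a single helper. The following is only a sketch (not used for the results reported here); it assumes the points argument is any sf object of event locations, and it uses tidy evaluation to name the output column from a string.

#sketch of a helper wrapping the repeated buffer / spatial-join / count pattern above
count_events_near_parcels <- function(parcels_sf, points_sf, tally_name,
                                      distance = set_units(200, "m")) {
  parcels_sf %>%
    st_transform(2253) %>% st_buffer(distance) %>%
    st_join(points_sf %>% st_transform(2253), join = st_contains) %>%
    as.data.frame() %>% select(-geometry) %>%
    group_by(parcelnum) %>% summarize(!!tally_name := n())
}

#for example (sketch):
#count_events_near_parcels(improve_detroit_and_violations_tallies,
#                          earlier_crime_sf %>% filter(general_type == "Violent Crime"),
#                          "earlier_violent_crime_incidents")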

#rm(tallies_with_earlier_property_crime, tallies_with_later_crime, tallies_with_earlier_violent_crime)
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#the formula now also includes the tallies of earlier reported crimes
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + earlier_nuiscance_offences

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6457068
sd(as.numeric(accuracies))
## [1] 0.02623764
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 116  74
##   TRUE   67 107
##        truth
## pred      0   1
##   FALSE 119  49
##   TRUE   62 134
##        truth
## pred      0   1
##   FALSE 132  70
##   TRUE   66  97
##        truth
## pred      0   1
##   FALSE 132  67
##   TRUE   60 105
##        truth
## pred      0   1
##   FALSE 124  60
##   TRUE   62 118
##        truth
## pred      0   1
##   FALSE 117  70
##   TRUE   60 117
##        truth
## pred      0   1
##   FALSE 129  67
##   TRUE   71  97
##        truth
## pred      0   1
##   FALSE 113  63
##   TRUE   76 112
##        truth
## pred      0   1
##   FALSE 115  77
##   TRUE   47 125
##        truth
## pred      0   1
##   FALSE 123  52
##   TRUE   70 119
models[4]
## [[1]]
## 
## Call:  glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]], 
##     ])
## 
## Coefficients:
##                      (Intercept)             total_fines_by_parcel  
##                        0.4616433                         0.0002497  
##   number_of_violations_by_parcel            improve_issues_tallies  
##                        0.0598077                         0.0008865  
## earlier_property_crime_incidents   earlier_violent_crime_incidents  
##                       -0.0160338                         0.0100555  
##       earlier_nuiscance_offences  
##                        0.0746626  
## 
## Degrees of Freedom: 3276 Total (i.e. Null);  3270 Residual
## Null Deviance:       4542 
## Residual Deviance: 4200  AIC: 4214
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as in the logistic model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6272972
sd(as.numeric(accuracies))
## [1] 0.02379004
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 117  72
##   TRUE   69 106
##        truth
## pred      0   1
##   FALSE 106  49
##   TRUE   80 129
##        truth
## pred      0   1
##   FALSE 116  61
##   TRUE   71 117
##        truth
## pred      0   1
##   FALSE 105  68
##   TRUE   81 110
##        truth
## pred      0   1
##   FALSE 105  48
##   TRUE   81 130
##        truth
## pred      0   1
##   FALSE 105  54
##   TRUE   81 124
##        truth
## pred      0   1
##   FALSE 124  58
##   TRUE   62 120
##        truth
## pred      0   1
##   FALSE  96  48
##   TRUE   90 130
##        truth
## pred      0   1
##   FALSE 102  54
##   TRUE   84 124
##        truth
## pred      0   1
##   FALSE 106  66
##   TRUE   80 112
models[[10]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 25 2206  926 0 (0.5802357 0.4197643)  
##      4) earlier_property_crime_incidents>=166.5 1143  368 0 (0.6780402 0.3219598) *
##      5) earlier_property_crime_incidents< 166.5 1063  505 1 (0.4750706 0.5249294)  
##       10) earlier_violent_crime_incidents< 180.5 846  415 0 (0.5094563 0.4905437)  
##         20) earlier_property_crime_incidents>=118.5 364  131 0 (0.6401099 0.3598901) *
##         21) earlier_property_crime_incidents< 118.5 482  198 1 (0.4107884 0.5892116) *
##       11) earlier_violent_crime_incidents>=180.5 217   74 1 (0.3410138 0.6589862) *
##    3) total_fines_by_parcel>=25 1071  395 1 (0.3688142 0.6311858) *
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#Random Forest, using the same formula as in the logistic and rpart models (using 500 decision trees)
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6580581
sd(as.numeric(accuracies))
## [1] 0.02161093
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 138  77
##   TRUE   48 101
##        truth
## pred      0   1
##   FALSE 126  63
##   TRUE   60 115
##        truth
## pred      0   1
##   FALSE 133  67
##   TRUE   54 111
##        truth
## pred      0   1
##   FALSE 116  72
##   TRUE   70 106
##        truth
## pred      0   1
##   FALSE 139  77
##   TRUE   47 101
##        truth
## pred      0   1
##   FALSE 134  69
##   TRUE   52 109
##        truth
## pred      0   1
##   FALSE 135  61
##   TRUE   51 117
##        truth
## pred      0   1
##   FALSE 132  66
##   TRUE   54 112
##        truth
## pred      0   1
##   FALSE 134  79
##   TRUE   52  99
##        truth
## pred      0   1
##   FALSE 133  73
##   TRUE   53 105

Now add some constant properties of the parcels to the models, to see if they improve the model fit.

#add the parcel area, to see if it improves the model
tallies_with_area <- tallies_with_crime %>% 
  left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, total_acre), 
            by = "parcelnum") %>% select(-geometry)
  
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#the formula now also includes the parcel area (total_acre)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + earlier_nuiscance_offences

models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.645983
sd(as.numeric(accuracies))
## [1] 0.02557795
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 118  74
##   TRUE   65 107
##        truth
## pred      0   1
##   FALSE 119  49
##   TRUE   62 134
##        truth
## pred      0   1
##   FALSE 130  70
##   TRUE   68  97
##        truth
## pred      0   1
##   FALSE 133  67
##   TRUE   59 105
##        truth
## pred      0   1
##   FALSE 124  60
##   TRUE   62 118
##        truth
## pred      0   1
##   FALSE 117  70
##   TRUE   60 117
##        truth
## pred      0   1
##   FALSE 129  67
##   TRUE   71  97
##        truth
## pred      0   1
##   FALSE 113  62
##   TRUE   76 113
##        truth
## pred      0   1
##   FALSE 116  77
##   TRUE   46 125
##        truth
## pred      0   1
##   FALSE 120  51
##   TRUE   73 120

Now fit and test an rpart model with the same predictors.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as with the glm model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6305916
sd(as.numeric(accuracies))
## [1] 0.02164158
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 118  70
##   TRUE   68 108
##        truth
## pred      0   1
##   FALSE 106  47
##   TRUE   80 131
##        truth
## pred      0   1
##   FALSE 112  54
##   TRUE   75 124
##        truth
## pred      0   1
##   FALSE 111  71
##   TRUE   75 107
##        truth
## pred      0   1
##   FALSE 118  62
##   TRUE   68 116
##        truth
## pred      0   1
##   FALSE 113  54
##   TRUE   73 124
##        truth
## pred      0   1
##   FALSE 120  61
##   TRUE   66 117
##        truth
## pred      0   1
##   FALSE 101  52
##   TRUE   85 126
##        truth
## pred      0   1
##   FALSE 128  78
##   TRUE   58 100
##        truth
## pred      0   1
##   FALSE 115  77
##   TRUE   71 101
models[[9]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 35 2220  940 0 (0.5765766 0.4234234)  
##      4) earlier_property_crime_incidents>=121.5 1675  611 0 (0.6352239 0.3647761) *
##      5) earlier_property_crime_incidents< 121.5 545  216 1 (0.3963303 0.6036697)  
##       10) total_acre>=0.1225 132   48 0 (0.6363636 0.3636364) *
##       11) total_acre< 0.1225 413  132 1 (0.3196126 0.6803874) *
##    3) total_fines_by_parcel>=35 1057  395 1 (0.3736991 0.6263009) *

Try and test a random forest model.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

#use the same formula as with the glm and rpart models above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6712336
sd(as.numeric(accuracies))
## [1] 0.03294328
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 136  67
##   TRUE   50 111
##        truth
## pred      0   1
##   FALSE 123  58
##   TRUE   63 120
##        truth
## pred      0   1
##   FALSE 140  59
##   TRUE   47 119
##        truth
## pred      0   1
##   FALSE 117  64
##   TRUE   69 114
##        truth
## pred      0   1
##   FALSE 135  72
##   TRUE   51 106
##        truth
## pred      0   1
##   FALSE 135  58
##   TRUE   51 120
##        truth
## pred      0   1
##   FALSE 137  62
##   TRUE   49 116
##        truth
## pred      0   1
##   FALSE 142  62
##   TRUE   44 116
##        truth
## pred      0   1
##   FALSE 126  72
##   TRUE   60 106
##        truth
## pred      0   1
##   FALSE 126  79
##   TRUE   60  99
#add the city council district, to see if it improves the model
tallies_with_district <- tallies_with_area %>% 
  left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, council_di), 
            by = "parcelnum")
  
tallies_with_district$council_di <- as.character(tallies_with_district$council_di)

district_recording_errors <- tallies_with_district %>% 
  filter(! council_di %in% c("01", "02", "03", "04", "05", "06", "07"))

district_recording_errors <- parcels_with_labels %>% select(parcelnum) %>%
  inner_join(district_recording_errors, by = "parcelnum")

district_recording_errors <- district_recording_errors %>% 
  st_join(city_council_districts %>% select(districts), join = st_within)
## although coordinates are longitude/latitude, st_within assumes that they are planar
district_recording_errors <- district_recording_errors %>%
  mutate(council_di = districts) %>% as.data.frame() %>% 
  select(-districts, -geometry) %>% as.tibble()

tallies_with_district <- tallies_with_district %>%
  filter(council_di %in% c("01", "02", "03", "04", "05", "06", "07")) %>%
  rbind(district_recording_errors)

#observing that, for example, district 7 is currently represented by both of the strings "07" and "7", we make the representation consistent
tallies_with_district <- tallies_with_district %>%
  mutate(council_di = as.integer(council_di)) %>%
  mutate(council_di = as.factor(council_di))
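As a quick sanity check (a sketch only), we can confirm that the district codes now take just the seven expected values:

#sketch: the district codes should now be exactly 1 through 7
tallies_with_district %>% count(council_di)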

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_district %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6369028
sd(as.numeric(accuracies))
## [1] 0.02471345
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 103  44
##   TRUE   83 134
##        truth
## pred      0   1
##   FALSE 105  42
##   TRUE   81 136
##        truth
## pred      0   1
##   FALSE 126  58
##   TRUE   61 120
##        truth
## pred      0   1
##   FALSE 107  59
##   TRUE   79 119
##        truth
## pred      0   1
##   FALSE 126  74
##   TRUE   60 104
##        truth
## pred      0   1
##   FALSE 112  76
##   TRUE   74 102
##        truth
## pred      0   1
##   FALSE 110  56
##   TRUE   76 122
##        truth
## pred      0   1
##   FALSE 130  70
##   TRUE   56 108
##        truth
## pred      0   1
##   FALSE 109  58
##   TRUE   77 120
##        truth
## pred      0   1
##   FALSE 110  62
##   TRUE   76 116
models[[1]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 35 2228  942 0 (0.5771993 0.4228007)  
##      4) total_acre>=0.1065 1067  338 0 (0.6832240 0.3167760) *
##      5) total_acre< 0.1065 1161  557 1 (0.4797588 0.5202412)  
##       10) earlier_property_crime_incidents>=164.5 576  232 0 (0.5972222 0.4027778)  
##         20) earlier_violent_crime_incidents< 264.5 350  108 0 (0.6914286 0.3085714) *
##         21) earlier_violent_crime_incidents>=264.5 226  102 1 (0.4513274 0.5486726) *
##       11) earlier_property_crime_incidents< 164.5 585  213 1 (0.3641026 0.6358974) *
##    3) total_fines_by_parcel>=35 1049  389 1 (0.3708294 0.6291706) *

Now try random forest, with the dataset and formula above.

#same formula as in rpart model above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.686616
sd(as.numeric(accuracies))
## [1] 0.02196345
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 132  53
##   TRUE   54 125
##        truth
## pred      0   1
##   FALSE 132  68
##   TRUE   54 110
##        truth
## pred      0   1
##   FALSE 147  63
##   TRUE   40 115
##        truth
## pred      0   1
##   FALSE 129  56
##   TRUE   57 122
##        truth
## pred      0   1
##   FALSE 135  63
##   TRUE   51 115
##        truth
## pred      0   1
##   FALSE 133  67
##   TRUE   53 111
##        truth
## pred      0   1
##   FALSE 129  61
##   TRUE   57 117
##        truth
## pred      0   1
##   FALSE 140  55
##   TRUE   46 123
##        truth
## pred      0   1
##   FALSE 123  58
##   TRUE   63 120
##        truth
## pred      0   1
##   FALSE 131  67
##   TRUE   55 111
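Before adding further predictors, it is useful to ask which variables the forests actually rely on. The randomForest package records a mean decrease in Gini impurity for each predictor; the following sketch simply inspects the first of the ten cross-validation forests fitted above.

#sketch: variable importance (mean decrease in Gini) for one of the forests
importance(models[[1]])
#varImpPlot(models[[1]])  #the same information, as a plot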

Now add the frontage (from parcel data). I wouldn’t expect this to add very much to the value of the model (given that we have already included area), but let’s see.

tallies_with_frontage <- tallies_with_district %>% 
  left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, frontage), by = "parcelnum")

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_frontage %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6498043
sd(as.numeric(accuracies))
## [1] 0.03416033
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 103  49
##   TRUE   83 129
##        truth
## pred      0   1
##   FALSE 123  53
##   TRUE   63 125
##        truth
## pred      0   1
##   FALSE 132  50
##   TRUE   55 128
##        truth
## pred      0   1
##   FALSE 105  61
##   TRUE   81 117
##        truth
## pred      0   1
##   FALSE 116  54
##   TRUE   70 124
##        truth
## pred      0   1
##   FALSE 118  65
##   TRUE   68 113
##        truth
## pred      0   1
##   FALSE 119  62
##   TRUE   67 116
##        truth
## pred      0   1
##   FALSE 124  55
##   TRUE   62 123
##        truth
## pred      0   1
##   FALSE 106  51
##   TRUE   80 127
##        truth
## pred      0   1
##   FALSE 104  64
##   TRUE   82 114
models[[2]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.5111382 0.4888618)  
##    2) total_fines_by_parcel< 35 2220  942 0 (0.5756757 0.4243243)  
##      4) frontage>=35.5 1296  423 0 (0.6736111 0.3263889) *
##      5) frontage< 35.5 924  405 1 (0.4383117 0.5616883)  
##       10) earlier_property_crime_incidents>=133.5 550  256 0 (0.5345455 0.4654545)  
##         20) council_di=3,6,7 262   96 0 (0.6335878 0.3664122) *
##         21) council_di=1,2,4,5 288  128 1 (0.4444444 0.5555556)  
##           42) earlier_property_crime_incidents>=179.5 168   76 0 (0.5476190 0.4523810)  
##             84) earlier_violent_crime_incidents< 260.5 92   26 0 (0.7173913 0.2826087) *
##             85) earlier_violent_crime_incidents>=260.5 76   26 1 (0.3421053 0.6578947) *
##           43) earlier_property_crime_incidents< 179.5 120   36 1 (0.3000000 0.7000000) *
##       11) earlier_property_crime_incidents< 133.5 374  111 1 (0.2967914 0.7032086) *
##    3) total_fines_by_parcel>=35 1057  397 1 (0.3755913 0.6244087) *

Random forest again:

#same formula as in rpart model above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6855163
sd(as.numeric(accuracies))
## [1] 0.02510488
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 124  51
##   TRUE   62 127
##        truth
## pred      0   1
##   FALSE 122  66
##   TRUE   64 112
##        truth
## pred      0   1
##   FALSE 138  53
##   TRUE   49 125
##        truth
## pred      0   1
##   FALSE 121  53
##   TRUE   65 125
##        truth
## pred      0   1
##   FALSE 131  55
##   TRUE   55 123
##        truth
## pred      0   1
##   FALSE 132  58
##   TRUE   54 120
##        truth
## pred      0   1
##   FALSE 127  53
##   TRUE   59 125
##        truth
## pred      0   1
##   FALSE 131  47
##   TRUE   55 131
##        truth
## pred      0   1
##   FALSE 114  51
##   TRUE   72 127
##        truth
## pred      0   1
##   FALSE 123  60
##   TRUE   63 118

We now look at the effect of nearby parcels with no buildings on them. Many of these will be lots on which buildings have been dismantled. (However, some of them could be public parks or the like.)
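For a rough sense of scale, the sketch below uses the num_buildi field of the parcel shapefile to count how many parcels currently have no buildings recorded on them; it is a side check only and is not used in the modeling.

#sketch: how many parcels have no buildings recorded
parcel_sf %>% as.data.frame() %>%
  summarize(parcels_without_buildings = sum(num_buildi == 0, na.rm = TRUE),
            total_parcels = n())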

distance <- set_units(200, "m")  #for crimes within 200 meters

vacant_parcel_buffers <- parcel_sf %>% select(parcelnum) %>% 
  inner_join(tallies_with_frontage, by = "parcelnum") %>%
  st_transform(2253) %>% st_buffer(distance)
  
vacant_parcel_join <- vacant_parcel_buffers %>% 
  st_join(parcel_sf %>% st_transform(2253) %>% filter(num_buildi == 0) %>% select(parcelnum), 
          join = st_contains) %>%
  filter(parcelnum.x != parcelnum.y) %>% 
  group_by(parcelnum.x) %>% summarize(num_vacant_parcels = n())

tallies_with_vacant_parcels <- tallies_with_frontage %>% 
  left_join(vacant_parcel_join %>% as.data.frame() %>% select(-geometry),
            by = c("parcelnum" = "parcelnum.x"))

tallies_with_vacant_parcels <- tallies_with_vacant_parcels %>%
  mutate(num_vacant_parcels = ifelse(is.na(num_vacant_parcels), 0, num_vacant_parcels))

Model with rpart:

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_vacant_parcels %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)


formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6772859
sd(as.numeric(accuracies))
## [1] 0.02237345
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 122  55
##   TRUE   64 123
##        truth
## pred      0   1
##   FALSE 120  61
##   TRUE   66 117
##        truth
## pred      0   1
##   FALSE 119  49
##   TRUE   68 129
##        truth
## pred      0   1
##   FALSE  96  39
##   TRUE   90 139
##        truth
## pred      0   1
##   FALSE 123  37
##   TRUE   63 141
##        truth
## pred      0   1
##   FALSE 112  47
##   TRUE   74 131
##        truth
## pred      0   1
##   FALSE 119  52
##   TRUE   67 126
##        truth
## pred      0   1
##   FALSE 110  42
##   TRUE   76 136
##        truth
## pred      0   1
##   FALSE 114  40
##   TRUE   72 138
##        truth
## pred      0   1
##   FALSE 124  51
##   TRUE   62 127
models[[7]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 3277 1602 0 (0.51113824 0.48886176)  
##    2) num_vacant_parcels< 41.5 1672  525 0 (0.68600478 0.31399522)  
##      4) total_fines_by_parcel< 35 1139  261 0 (0.77085162 0.22914838) *
##      5) total_fines_by_parcel>=35 533  264 0 (0.50469043 0.49530957)  
##       10) frontage>=79 22    2 0 (0.90909091 0.09090909) *
##       11) frontage< 79 511  249 1 (0.48727984 0.51272016)  
##         22) total_fines_by_parcel< 512.5 262  113 0 (0.56870229 0.43129771) *
##         23) total_fines_by_parcel>=512.5 249  100 1 (0.40160643 0.59839357) *
##    3) num_vacant_parcels>=41.5 1605  528 1 (0.32897196 0.67102804) *

Now try random forest:

models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7061185
sd(as.numeric(accuracies))
## [1] 0.0239571
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 119  48
##   TRUE   67 130
##        truth
## pred      0   1
##   FALSE 127  63
##   TRUE   59 115
##        truth
## pred      0   1
##   FALSE 137  49
##   TRUE   50 129
##        truth
## pred      0   1
##   FALSE 124  46
##   TRUE   62 132
##        truth
## pred      0   1
##   FALSE 135  45
##   TRUE   51 133
##        truth
## pred      0   1
##   FALSE 130  53
##   TRUE   56 125
##        truth
## pred      0   1
##   FALSE 131  56
##   TRUE   55 122
##        truth
## pred      0   1
##   FALSE 135  46
##   TRUE   51 132
##        truth
## pred      0   1
##   FALSE 128  42
##   TRUE   58 136
##        truth
## pred      0   1
##   FALSE 132  59
##   TRUE   54 119

With the idea that blighted buildings may tend to come in clusters, we may guess that blight in one building indicates that nearby buildings are more likely to become blighted. We thus, for each parcel we are using for training and testing, count the number of nearby buildings that became blighted before June of 2016, before the period over which we are attempting to carry out our predictions.

library(lubridate)

#identify the earlier completed demolitions and put the results into a form that can be combined with the dismantle permits results
earlier_completed_demolitions_sf <- completed_demolitions_sf %>% 
  mutate(Date = `Demolition Date`, parcelnum = `Parcel ID`) %>%
  select(Date, parcelnum) %>% filter(Date < mdy("06/01/2016"))

#repeat for the dismantle permit results
earlier_dismantle_permits_sf <- dismantle_permits_sf %>%
  mutate(Date = `Permit Issued`, parcelnum = `Parcel Number`) %>%
  select(Date, parcelnum) %>% filter(Date < mdy("06/01/2016"))

#combine the above results, and ensure that parcels being counted are not the parcels that we are using for our modeling dataset
earlier_blighted_parcels <- rbind(earlier_completed_demolitions_sf, earlier_dismantle_permits_sf) %>%
  as.data.frame() %>% select(-geometry) %>% distinct(parcelnum) %>% 
  filter(! parcelnum %in% tallies_with_vacant_parcels$parcelnum)

#add the parcel geometry to the set of earlier blighted parcels
earlier_blighted_parcels <- parcel_sf %>% select(parcelnum) %>% 
  inner_join(earlier_blighted_parcels, by = "parcelnum")

#set a distance of 100 meters
distance <- set_units(100, "m") #for setting a buffer of 100 meter around each relevant parcel

#expand the geometry for each parcel by 100 meters
buffered_parcels <- parcel_sf %>% select(parcelnum) %>%
  inner_join(tallies_with_vacant_parcels, by = "parcelnum") %>%
  st_transform(2253) %>% st_buffer(distance)

#count the numbers of blighted parcels within the expanded geometry (excluding the parcel for which we are counting)
tallies_of_earlier_blighted_parcels <- buffered_parcels %>% select(parcelnum) %>%
  st_join(earlier_blighted_parcels %>% st_transform(2253), 
          join = st_contains, left = FALSE) %>% filter(parcelnum.x != parcelnum.y)

tallies_of_earlier_blighted_parcels <- tallies_of_earlier_blighted_parcels %>%
  as.data.frame() %>% select(-geometry) %>%
  group_by(parcelnum.x) %>% summarize(num_nearby_blighted_parcels = n())

#our modeling dataset, now with the new quantity
tallies_with_earlier_blighted_parcels <- tallies_with_vacant_parcels %>%
  left_join(tallies_of_earlier_blighted_parcels, by = c("parcelnum" = "parcelnum.x")) %>%
  mutate(num_nearby_blighted_parcels = ifelse(is.na(num_nearby_blighted_parcels), 0,
                                              num_nearby_blighted_parcels))

rm(earlier_completed_demolitions_sf, earlier_dismantle_permits_sf, buffered_parcels, earlier_blighted_parcels, 
   tallies_of_earlier_blighted_parcels)

We now try an rpart model, this time including the earlier demolition data.

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_earlier_blighted_parcels %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  num_nearby_blighted_parcels + earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6970548
sd(as.numeric(accuracies))
## [1] 0.02505895
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 110  38
##   TRUE   76 140
##        truth
## pred      0   1
##   FALSE 107  42
##   TRUE   79 136
##        truth
## pred      0   1
##   FALSE 119  34
##   TRUE   68 144
##        truth
## pred      0   1
##   FALSE 130  49
##   TRUE   56 129
##        truth
## pred      0   1
##   FALSE 126  31
##   TRUE   60 147
##        truth
## pred      0   1
##   FALSE 129  55
##   TRUE   57 123
##        truth
## pred      0   1
##   FALSE 103  34
##   TRUE   83 144
##        truth
## pred      0   1
##   FALSE 102  36
##   TRUE   84 142
##        truth
## pred      0   1
##   FALSE 107  30
##   TRUE   79 148
##        truth
## pred      0   1
##   FALSE 107  33
##   TRUE   79 145
models[[7]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3277 1602 0 (0.5111382 0.4888618)  
##   2) num_nearby_blighted_parcels< 1.5 1561  456 0 (0.7078796 0.2921204)  
##     4) num_vacant_parcels< 50.5 1310  307 0 (0.7656489 0.2343511) *
##     5) num_vacant_parcels>=50.5 251  102 1 (0.4063745 0.5936255) *
##   3) num_nearby_blighted_parcels>=1.5 1716  570 1 (0.3321678 0.6678322) *

Now try random forest:

models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7203952
sd(as.numeric(accuracies))
## [1] 0.02796726
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 125  45
##   TRUE   61 133
##        truth
## pred      0   1
##   FALSE 132  60
##   TRUE   54 118
##        truth
## pred      0   1
##   FALSE 142  42
##   TRUE   45 136
##        truth
## pred      0   1
##   FALSE 126  38
##   TRUE   60 140
##        truth
## pred      0   1
##   FALSE 140  47
##   TRUE   46 131
##        truth
## pred      0   1
##   FALSE 136  45
##   TRUE   50 133
##        truth
## pred      0   1
##   FALSE 128  56
##   TRUE   58 122
##        truth
## pred      0   1
##   FALSE 120  45
##   TRUE   66 133
##        truth
## pred      0   1
##   FALSE 133  38
##   TRUE   53 140
##        truth
## pred      0   1
##   FALSE 126  49
##   TRUE   60 129

Now we add the number of blight violations at nearby parcels, together with the total fines associated with them.

parcels <- parcel_sf %>% select(parcelnum) %>% 
  inner_join(tallies_with_earlier_blighted_parcels %>% select(parcelnum), 
             by = "parcelnum")

distance <- set_units(100, "m")

buffered_parcels <- parcels %>% st_transform(2253) %>% st_buffer(distance)

blight_violation_points_in_buffer <- blight_violations_sf %>% 
  st_transform(2253) %>% st_join(buffered_parcels, join = st_within) %>%
  filter(`Violation Parcel ID` != parcelnum)

violations_at_neighboring_parcels <- blight_violation_points_in_buffer %>% 
  group_by(parcelnum) %>% 
  summarize(num_violations_nearby_parcels = n(),
            sum_fines_nearby_parcels = sum(as.numeric(str_sub(`Fine Amount`, start = 2)), na.rm = TRUE))
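Note that the str_sub call above assumes each Fine Amount is stored as a string with a single leading currency symbol (e.g. "$100.00"). A slightly more forgiving alternative (a sketch only, not used here) is readr::parse_number, which drops currency symbols and thousands separators:

#sketch: parse_number handles "$" and "," without positional assumptions
readr::parse_number(c("$100.00", "$1,500.00"))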

tallies_with_nearby_violations <- tallies_with_earlier_blighted_parcels %>%
  left_join(violations_at_neighboring_parcels %>% as.data.frame() %>% select(-geometry),
            by = "parcelnum")

tallies_with_nearby_violations <- tallies_with_nearby_violations %>%
  mutate(sum_fines_nearby_parcels = 
           ifelse(is.na(sum_fines_nearby_parcels), 0, sum_fines_nearby_parcels),
         num_violations_nearby_parcels = 
           ifelse(is.na(num_violations_nearby_parcels), 0, num_violations_nearby_parcels))

We use this dataset to construct an rpart model:

#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_nearby_violations %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)

#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(515)
k_folds <- caret::createFolds(train$blighted)

formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + 
  improve_issues_tallies + earlier_property_crime_incidents + 
  earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
  num_nearby_blighted_parcels + num_violations_nearby_parcels + earlier_nuiscance_offences

models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6915663
sd(as.numeric(accuracies))
## [1] 0.02177809
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 107  30
##   TRUE   80 148
##        truth
## pred      0   1
##   FALSE 108  34
##   TRUE   78 144
##        truth
## pred      0   1
##   FALSE 101  38
##   TRUE   85 140
##        truth
## pred      0   1
##   FALSE 127  55
##   TRUE   59 123
##        truth
## pred      0   1
##   FALSE 112  40
##   TRUE   74 138
##        truth
## pred      0   1
##   FALSE 122  53
##   TRUE   64 125
##        truth
## pred      0   1
##   FALSE 123  34
##   TRUE   63 144
##        truth
## pred      0   1
##   FALSE 110  27
##   TRUE   76 151
##        truth
## pred      0   1
##   FALSE 131  67
##   TRUE   55 111
##        truth
## pred      0   1
##   FALSE 110  35
##   TRUE   76 143
models[[3]]
## n= 3277 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 3277 1602 0 (0.5111382 0.4888618)  
##   2) num_nearby_blighted_parcels< 1.5 1557  456 0 (0.7071291 0.2928709)  
##     4) num_vacant_parcels< 51.5 1316  307 0 (0.7667173 0.2332827) *
##     5) num_vacant_parcels>=51.5 241   92 1 (0.3817427 0.6182573) *
##   3) num_nearby_blighted_parcels>=1.5 1720  574 1 (0.3337209 0.6662791) *
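To see the splits in one of these trees more clearly, a plot along the following lines can help; this is a minimal sketch assuming the rpart.plot package (not loaded in this script) is installed:

#plot the third cross-validation tree; rpart.plot is an assumed extra dependency
library(rpart.plot)
rpart.plot(models[[3]], type = 2, extra = 104)

Next, we fit a logistic regression model over the same folds: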
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))

predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
                                      type = "response"))
accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))


#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7058453
sd(as.numeric(accuracies))
## [1] 0.01969066
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 142  56
##   TRUE   45 122
##        truth
## pred      0   1
##   FALSE 140  65
##   TRUE   46 113
##        truth
## pred      0   1
##   FALSE 131  58
##   TRUE   55 120
##        truth
## pred      0   1
##   FALSE 138  60
##   TRUE   48 118
##        truth
## pred      0   1
##   FALSE 131  56
##   TRUE   55 122
##        truth
## pred      0   1
##   FALSE 142  61
##   TRUE   44 117
##        truth
## pred      0   1
##   FALSE 152  64
##   TRUE   34 114
##        truth
## pred      0   1
##   FALSE 145  54
##   TRUE   41 124
##        truth
## pred      0   1
##   FALSE 135  66
##   TRUE   51 112
##        truth
## pred      0   1
##   FALSE 140  66
##   TRUE   46 112
models[[9]]
## 
## Call:  glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]], 
##     ])
## 
## Coefficients:
##                      (Intercept)             total_fines_by_parcel  
##                        -0.862438                          0.000235  
##   number_of_violations_by_parcel            improve_issues_tallies  
##                         0.110902                          0.001646  
## earlier_property_crime_incidents   earlier_violent_crime_incidents  
##                        -0.007130                          0.005644  
##                       total_acre                       council_di2  
##                         0.088485                         -0.235041  
##                      council_di3                       council_di4  
##                        -0.133384                          0.020426  
##                      council_di5                       council_di6  
##                        -0.572968                         -0.343677  
##                      council_di7                          frontage  
##                        -0.193995                         -0.005878  
##               num_vacant_parcels       num_nearby_blighted_parcels  
##                         0.014407                          0.174176  
##    num_violations_nearby_parcels        earlier_nuiscance_offences  
##                        -0.004899                          0.026017  
## 
## Degrees of Freedom: 3276 Total (i.e. Null);  3259 Residual
## Null Deviance:       4541 
## Residual Deviance: 3682  AIC: 3718
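
Because the logistic-regression predictions are probabilities rather than hard class labels, a threshold-free summary such as the area under the ROC curve can also be computed. A minimal sketch, assuming the pROC package is available:

#mean AUC over the ten held-out folds (pROC is an assumed extra dependency)
library(pROC)
aucs <- 1:10 %>% 
  map_dbl(~ as.numeric(auc(roc(train[k_folds[[.x]],]$blighted, predictions[[.x]]))))
mean(aucs)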

Now we try a random forest:

models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))

predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)

accuracies <- 1:10 %>% 
  map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))

#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.73112
sd(as.numeric(accuracies))
## [1] 0.01820602
#confusion matrices for the models
for (index in 1:10) {
  print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
##        truth
## pred      0   1
##   FALSE 129  43
##   TRUE   58 135
##        truth
## pred      0   1
##   FALSE 130  46
##   TRUE   56 132
##        truth
## pred      0   1
##   FALSE 128  46
##   TRUE   58 132
##        truth
## pred      0   1
##   FALSE 135  41
##   TRUE   51 137
##        truth
## pred      0   1
##   FALSE 127  34
##   TRUE   59 144
##        truth
## pred      0   1
##   FALSE 131  48
##   TRUE   55 130
##        truth
## pred      0   1
##   FALSE 134  44
##   TRUE   52 134
##        truth
## pred      0   1
##   FALSE 139  37
##   TRUE   47 141
##        truth
## pred      0   1
##   FALSE 133  52
##   TRUE   53 126
##        truth
## pred      0   1
##   FALSE 133  46
##   TRUE   53 132
models[[3]]
## 
## Call:
##  randomForest(formula = formula, data = train[-k_folds[[.x]],      ]) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.1%
## Confusion matrix:
##      0    1 class.error
## 0 1181  494   0.2949254
## 1  394 1208   0.2459426
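
The fitted forests also provide a measure of variable importance (mean decrease in Gini), which can be inspected for one of the folds, for example:

#variable importance for the third cross-validation forest
importance(models[[3]])
varImpPlot(models[[3]])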

Save the data frame we are using for modeling to disk, for use in other scripts.

#tally vagrancy incidents at each property (this object is not joined into the saved tallies and is removed below)
tallies_with_vagrancy_at_property <- tallies_with_nearby_violations %>% 
  inner_join(parcel_sf %>% select(parcelnum, geometry), by = "parcelnum") %>%
  st_sf() %>% st_join(earlier_crime_sf %>% filter(CATEGORY == "VAGRANCY (OTHER)"), 
                      join = st_contains, left = FALSE) %>% 
  group_by(parcelnum) %>% summarize(vagrancy_incidents_at_property = n())

rm(tallies_with_vagrancy_at_property, tallies_with_arson_at_property)
write_rds(tallies_with_nearby_violations, "calculated_tallies.rds")

write_rds(training_parcelnums, "training_parcelnums.rds")
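
In a downstream modeling script, these files can then be reloaded with read_rds; for example (the object names here are just illustrative):

#in a separate script: reload the saved tallies and the training parcel numbers
calculated_tallies <- read_rds("calculated_tallies.rds")
training_parcelnums <- read_rds("training_parcelnums.rds")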