In this project, I will attempt to predict whether any building within a set of identified buildings in Detroit will be targeted for demolition. Potential predictors include citations related to the building, crime, and complaints concerning the building related to blight. I have more or less completed the data cleaning, and I am in the process of associating the various predictors with the buildings. So far I am using about 3GB of data downloaded from https://data.detroitmi.gov/.
library(tidyverse)
library(sf)
library(ggmap)
#recorded violations associated with blight (e.g. unkempt properties)
blight_violations <- read_csv("./data/Blight_Violations_3_19_2018.csv",
guess_max = 10^6)
#read the downloaded file for all the building permits and then keep only the permits for dismantling
dismantle_permits <- read_csv("./data/Building_Permits_3_19_2018.csv",
guess_max = 10^6) %>%
filter(`Building Permit Type` == "Dismantle")
#the files that contain the crime data
crime_to_12062016 <-
read_csv("./data/DPD__All_Crime_Incidents__January_1__2009_-_December_6__2016.csv",
guess_max = 10^6)
crime_12062016_to_03192018 <-
read_csv("./data/DPD__All_Crime_Incidents__December_6__2016_-_3_19_2018.csv",
guess_max = 10^6)
#the 311 system
improve_detroit_issues <- read_csv("./data/Improve_Detroit_Issues_3_19_2018.csv",
guess_max = 10^6)
#another file with demolition information downloaded 4/4/2017. I should note that this file appears to cover a somewhat different domain of cases than does dismantle_permits. The difference appears to be that completed_demolitions covers just those buildings that have been dismantled under the Detroit Demolition Program. It may thus be the case that this data omits cases of buildings demolished not because of blight but to make room for something else.
completed_demolitions <- read_csv("./data/Detroit_Demolitions.csv",
guess_max = 10^6)
upcoming_demolitions <- read_csv("./data/Upcoming_Demolitions.csv")
#the shapefile representing Detroit parcels, read as an sf data frame
parcel_sf <- st_read("./data/Parcel Map")
#convert the parcel number column to a character vector, to match `Parcel Number` in the dismantle permits data
parcel_sf <- parcel_sf %>% mutate(parcelnum = as.character(parcelnum))
#downloaded from http://d3-d3.opendata.arcgis.com/datasets/383eb730952e470389f09617b5448026_0 on 04/13/2018: "Hardest Hit Fund Areas, with Expansion"
Hardest_Hit_Fund_Areas <- st_read("./data/Hardest_Hit_Fund_Areas_with_Expansion.kml",
crs = st_crs(parcel_sf))
city_council_districts <- st_read("./data/City Council Districts")
For all of the downloaded data sets other than the parcels dataset, we extract the usable latitude and longitude values and then use this information to form simple features (sf) objects. Rows with obviously incorrect values, or values that would represent positions well outside Detroit, are filtered out, together with rows for which the latitude or longitude data is missing.
#function for converting the position (character) column into a column of points in the simple features (sf) framework.
add_sf_point <- function(df, column) {
#extract the latitude and longitude from the string column that contains both. With the parentheses
#located from the end of the strings, it is possible to use the same function for all seven of
#the datasets for which we need to extract this information.
latitude <- str_sub(df[[column]],
stringi::stri_locate_last_fixed(df[[column]], "(")[,2] + 1,
stringi::stri_locate_last_fixed(df[[column]], ",")[,1] - 1)
longitude <- str_sub(df[[column]],
stringi::stri_locate_last_fixed(df[[column]], ", ")[,2] + 1,
stringi::stri_locate_last_fixed(df[[column]], ")")[,1] - 1)
#add the latitude and longitude to a copy of the dataframe, filter out the NAs from
#these results, and then convert to sf, with point positions indicated in the
#geometry column
mutated <- df %>% mutate(extracted_lat = as.double(latitude),
extracted_lon = as.double(longitude))
#remove rows with NAs for latitude or longitude, or with values well outside of Detroit
filtered <- mutated %>%
filter(!is.na(extracted_lat) & !is.na(extracted_lon)) %>%
filter(41 < extracted_lat & extracted_lat < 44 & -85 < extracted_lon & extracted_lon < -81)
#create a dataframe from the items that have been filtered out
result_coord_na <- setdiff(mutated, filtered)
#create sf objects from the rows with usable latitude and longitude information
result_sf <- st_as_sf(filtered, coords = c("extracted_lon", "extracted_lat"),
crs = st_crs(parcel_sf))
return(list(result_sf, result_coord_na))
}
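As a quick sanity check on the extraction logic, we can run the function on a single made-up row. The address and coordinates below are hypothetical; the string just mimics the "(latitude, longitude)" ending of the real location columns, and the call relies on parcel_sf already being loaded for the CRS.
#hypothetical example row; the string format mimics the real data, ending in "(latitude, longitude)"
fake_row <- tibble(`Violation Location` = "123 MAIN ST\nDetroit, MI 48201\n(42.36, -83.08)")
check <- add_sf_point(fake_row, "Violation Location")
check[[1]] #a one-row sf data frame with a POINT geometry
check[[2]] #a zero-row data frame, since the coordinates were usable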
#apply the function to the seven datasets that were not loaded as simple features dataframes, producing for each a list of two dataframes: the first a simple features data frame, the second a dataframe with the instances that could not be converted to simple features
blight_violations_split <- add_sf_point(blight_violations, "Violation Location")
dismantle_permits_split <- add_sf_point(dismantle_permits, "Permit Location")
crime_to_12062016_split <- add_sf_point(crime_to_12062016, "LOCATION")
crime_12062016_to_03192018_split <- add_sf_point(crime_12062016_to_03192018, "Location")
improve_detroit_issues_split <- add_sf_point(improve_detroit_issues, "Location")
completed_demolitions_split <- add_sf_point(completed_demolitions, "Location")
upcoming_demolitions_split <- add_sf_point(upcoming_demolitions, "Location")
We now consider the data for which we do not yet have position data, and complete the information as well as we reasonably can, using the Google API and a function, geocode_pause, that handles some of the API’s quirks.
#the portion of the downloaded blight citations data for which we do not have usable position data
blight_vio_na <- blight_violations_split[[2]]
#remove the rows for which geocoding is not likely to produce reliable results
useful <- blight_vio_na %>% filter(!is.na(`Violation Street Name`),
`Violation Street Number` > 0,
!is.na(`Violation Zip Code`))
#create a column of addresses to be used in geocoding
useful <- useful %>%
mutate(complete_address = paste(`Violation Street Number`, " ", `Violation Street Name`, ", ",
"Detroit, Michigan", " ", `Violation Zip Code`, sep = ""))
#function that makes a maximum of 6 attempts to geocode the given address using the Google API, with a pause of 1 second between attempts; if all attempts fail, it returns NULL. We will use the function for the other datasets as well.
geocode_pause <- function(address) {
for (index in 1:6) {
Sys.sleep(1)
location <- ggmap::geocode(address)
if (!is.na(location$lon)) {
return(location)
}
}
}
#apply geocode_pause to each of the elements of the complete_address column and place the result in a new column, in which each entry is a data frame
useful <- useful %>% mutate(location = map(complete_address, geocode_pause))
#save to disk, to avoid the need to geocode these addresses again when we rerun the analysis
write_rds(useful, "./data/blight_violations_geocodes.rds")
The geocoding has returned a data frame for each of the addresses. We thus need to unpack the elements of the location column, each of which is a data frame.
#read blight_violations_geocodes as a tibble
blight_violations_geocodes <- read_rds("./data/blight_violations_geocodes.rds")
#function for removing the instances for which geocoding failed (i.e., for which the value in the location column is NULL). We will use this function for all of the geocoded data frames.
remove_null_locations <- function(df) {
  #identify the rows for which the value in the location column is NULL
  null_rows <- list()
  for (index in 1:nrow(df)) {
    if (is.null(df$location[[index]])) {
      null_rows <- c(null_rows, index)
    }
  }
  #if no rows were flagged, return the data frame unchanged (a negative empty index would drop every row)
  if (length(null_rows) == 0) {
    return(df)
  }
  #remove the rows for which the value of the location column is NULL
  df[-as.integer(null_rows),]
}
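For reference, a more compact equivalent is possible with purrr (loaded with the tidyverse); this is just an alternative sketch, and the analysis below continues to use the loop-based version above.
remove_null_locations_compact <- function(df) {
  #keep only the rows whose entry in the location column is not NULL
  df %>% filter(!map_lgl(location, is.null))
}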
blight_violations_geocodes <- remove_null_locations(blight_violations_geocodes)
#With blight_violations_geocodes a tibble, we can apply tidyr::unnest(), which will place the latitude and longitude in columns labelled "lat" and "lon".
blight_violations_geocodes <- blight_violations_geocodes %>% unnest(location)
#fill in the `Violation Latitude` and `Violation Longitude` columns, which already exist in the blight_violations data frame
blight_violations_geocodes <- blight_violations_geocodes %>%
mutate(`Violation Latitude` = lat,
`Violation Longitude` = lon)
#cut out some columns that have been added
blight_violations_geocodes <- blight_violations_geocodes %>%
select(-extracted_lat, -extracted_lon, -complete_address)
#put the position information into a simple features format (which will remove the "lat" and "lon" columns)
blight_violations_geocodes_sf <- st_as_sf(blight_violations_geocodes,
coords = c("lon", "lat"),
crs = st_crs(parcel_sf))
#combine the results with the previously generated sf data
blight_violations_sf <- rbind(blight_violations_split[[1]], blight_violations_geocodes_sf)
rm(blight_vio_na, blight_violations, blight_violations_geocodes,
blight_violations_geocodes_sf, blight_violations_split, useful)
#the dismantle permits for which position data (latitude and longitude) is missing
dismantle_permits_split_na <- dismantle_permits_split[[2]]
#remove the last two columns, which were not contained in the original dismantle_permits dataset
dismantle_permits_split_na <- dismantle_permits_split_na %>%
select(-extracted_lat, -extracted_lon)
#geocode the items in dismantle_permits_split_na, using the address column and the function geocode_pause, which makes a maximum of six attempts for each item. The result is a list of dataframes in the location column.
dismantle_permits_split_geocode <- dismantle_permits_split_na %>%
mutate(location = map(str_c(`Site Address`, ", Detroit, Michigan"), geocode_pause))
#write the results of the geocoding to disk, to avoid having to repeat the geocoding when rerunning the analysis.
write_rds(dismantle_permits_split_geocode, "./data/dismantle_permits_geocodes.rds")
rm(dismantle_permits_split_na)
#load the geocoded data frame into R
dismantle_permits_split_geocode <- read_rds("./data/dismantle_permits_geocodes.rds")
#use remove_null_locations() to remove the rows for which geocoding failed and then parse the information in the dataframes in the location column into two new columns, lat and lon
dismantle_permits_split_geocode <-
remove_null_locations(dismantle_permits_split_geocode) %>%
unnest(location)
#convert to a simple features (sf) data frame, using the latitudes and longitudes
dismantle_permits_geocode_sf <- st_as_sf(dismantle_permits_split_geocode,
coords = c("lon", "lat"),
crs = st_crs(parcel_sf))
#append this simple features dataframe to the dataframe for which we already had usable positions
dismantle_permits_sf <- rbind(dismantle_permits_split[[1]], dismantle_permits_geocode_sf)
rm(dismantle_permits_split_geocode, dismantle_permits_geocode_sf, dismantle_permits, dismantle_permits_split)
We now fill in the missing position information for the dataset of crimes up to 12-06-2016.
#return to the older crime data
crime_to_12062016_leftovers <- crime_to_12062016_split[[2]]
#cut out the addresses that begin with "00"
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>%
filter(str_sub(LOCATION, 1, 2) != "00")
#filter out some obviously useless addresses, with few characters before the first "("
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>%
filter(!str_locate(LOCATION, "\\(")[,1] %in% 1:13)
#remove the two columns that were added earlier
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>%
select(-extracted_lat, -extracted_lon)
#create a column for use in geocoding
crime_to_12062016_leftovers <- crime_to_12062016_leftovers %>%
mutate(extracted_address = str_c(str_sub(LOCATION, 1,
str_locate(LOCATION, "\\(")[,1] - 2),
", Detroit, Michigan"))
#geocode the elements of extracted_address, using the function geocode_pause
crime_to_12062016_leftovers_geocode <- crime_to_12062016_leftovers %>%
mutate(location = map(extracted_address, geocode_pause))
#save the results, to avoid having to geocode again when rerunning the analysis
write_rds(crime_to_12062016_leftovers_geocode, "./data/crime_to_12062016_leftovers_geocode.rds")
#read the saved results
crime_to_12062016_leftovers_geocode <- read_rds("./data/crime_to_12062016_leftovers_geocode.rds")
#cut out the column we used for geocoding
crime_to_12062016_leftovers_geocode <-
crime_to_12062016_leftovers_geocode %>% select(-extracted_address)
#cut out the geocode failures and put the location information into the columns lat and lon
crime_to_12062016_leftovers_geocode <-
remove_null_locations(crime_to_12062016_leftovers_geocode) %>%
unnest(location)
#create a simple features (sf) object, using the latitudes and longitudes
crime_to_12062016_leftovers_sf <- st_as_sf(crime_to_12062016_leftovers_geocode,
coords = c("lon", "lat"),
crs = st_crs(parcel_sf))
#append this simple features dataframe to the dataframe for which we already had locations
crime_to_12062016_sf <- rbind(crime_to_12062016_split[[1]], crime_to_12062016_leftovers_sf)
rm(crime_to_12062016, crime_to_12062016_leftovers_geocode, crime_to_12062016_leftovers_sf, crime_to_12062016_leftovers, crime_to_12062016_split)
#consider the examples in the recent crime data for which the conversion to sf didn't work, remove the two columns that we have added, and create an address column for geocoding
crime_12062016_to_03192018_leftovers <- crime_12062016_to_03192018_split[[2]] %>%
select(-extracted_lat, -extracted_lon) %>%
mutate(extracted_address = str_c(`Incident Address`, ", Detroit, Michigan"))
crime_12062016_to_03192018_geocode <- crime_12062016_to_03192018_leftovers %>%
mutate(location = map(extracted_address, geocode_pause))
write_rds(crime_12062016_to_03192018_geocode, "./data/crime_12062016_to_03192018_geocode.rds")
crime_12062016_to_03192018_geocode <- read_rds("./data/crime_12062016_to_03192018_geocode.rds") %>%
select(-extracted_address)
#remove the rows for which the value of location is NULL and then unnest the remaining locations
crime_12062016_to_03192018_geocode <-
remove_null_locations(crime_12062016_to_03192018_geocode) %>%
unnest(location)
#convert the dataframe to a simple features set
crime_12062016_to_03192018_sf <- st_as_sf(crime_12062016_to_03192018_geocode,
coords = c("lon", "lat"),
crs = st_crs(parcel_sf))
#combine the geocoded data with the sf dataframe created earlier
crime_12062016_to_03192018 <- rbind(crime_12062016_to_03192018_split[[1]], crime_12062016_to_03192018_sf)
rm(crime_12062016_to_03192018_split, crime_12062016_to_03192018_geocode, crime_12062016_to_03192018_leftovers, crime_12062016_to_03192018_sf)
#geocode the one item in the Improve Detroit Issues data for which the given coordinates were obviously incorrect, and then convert to an sf object. If geocoding fails, run this bit again
improve_detroit_issues_leftover_sf <- improve_detroit_issues_split[[2]] %>%
select(-extracted_lat, -extracted_lon) %>%
mutate(location = map(Address, geocode_pause)) %>%
unnest(location) %>%
st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf))
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=14530%20%20Vaughan%20Detroit%2C%20Michigan
## Warning: geocode failed with status OVER_QUERY_LIMIT, location = "14530
## Vaughan Detroit, Michigan"
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=14530%20%20Vaughan%20Detroit%2C%20Michigan
#splice with the previously generated sf dataframe
improve_detroit_issues <- rbind(improve_detroit_issues_split[[1]], improve_detroit_issues_leftover_sf)
rm(improve_detroit_issues_split, improve_detroit_issues_leftover_sf)
#create another set of demolition information
completed_demolitions_sf <- completed_demolitions_split[[1]]
#note that the location information in completed_demolitions_sf is complete (the dataframe of examples with missing location information has zero rows)
completed_demolitions_split[[2]]
#another set of demolition information
upcoming_demolitions_sf <- upcoming_demolitions_split[[1]]
upcoming_demolitions_split[[2]]
rm(completed_demolitions, completed_demolitions_split, upcoming_demolitions, upcoming_demolitions_split)
We begin the process of assigning labels to the buildings: blighted or not blighted. Buildings will be represented by parcels that have or have had buildings on them, whether by being listed in the parcel_sf data frame as including structures or by appearing in the dismantle permits dataframe as having had a dismantle permit associated with them, which suggests that there was a building on the parcel.
We will use parcel numbers to refer to the parcels. However, as the following bit of code shows, the parcels dataset contains a few rows in which the parcel numbers are the same (duplicate_parcel_numbers_in_parcel_data contains 78 rows).
#introduce a column of row numbers in the parcels data, for use in data cleaning
parcel_sf <- parcel_sf %>%
mutate(row_num = row_number())
#As per above, the following returns a 78-row data frame
duplicate_parcel_numbers_in_parcel_data <-
parcel_sf %>%
group_by(parcelnum) %>%
mutate(n = n()) %>%
ungroup() %>%
filter(n > 1) %>%
select(parcelnum, address, legaldesc, row_num)
duplicate_parcel_numbers_in_parcel_data
Plotting the parcel data suggests that, within groups of parcels that have the same parcel number, some of the parcels cover the same area while others are disjoint. We thus apply a spatial join, using sf::st_join, to find pairs of parcels that cover the same area. After that, we modify parcel_sf so that it is a maximal set that contains none of these duplicates.
#join the result with itself, on the basis of spatial identity
spatial_repeats <- st_join(duplicate_parcel_numbers_in_parcel_data,
duplicate_parcel_numbers_in_parcel_data,
st_equals, left = FALSE) %>% filter(row_num.x != row_num.y)
#select one element from each group of sf objects whose polygons cover the same area
selection_vector <- 1:nrow(spatial_repeats)
for (index in 1:nrow(spatial_repeats)) {
selection_vector[index] <- !(spatial_repeats$row_num.x[index] %in%
spatial_repeats$row_num.y[1:index])
}
selection_vector <- as.logical(selection_vector)
#the unique parcels, as specified in spatial_repeats
unique_parcels <- spatial_repeats[selection_vector,]
#the unique parcels, as specified in parcel_sf, with a row number added to the end of the parcel number (character vector)
unique_parcels <- parcel_sf %>% filter(row_num %in% unique_parcels$row_num.x) %>%
mutate(parcelnum = str_c(parcelnum, "_", row_number()))
#cut out the set of sf objects that have spatial repeats
parcel_sf <- parcel_sf %>% filter(!(row_num %in% spatial_repeats$row_num.x))
#bind the set of unique spatial objects to parcel_sf
parcel_sf <- parcel_sf %>% rbind(unique_parcels)
Having cut out the duplicate parcels (parcels that cover the same area), we plot the groups (as it turns out, pairs) that have the same parcel number.
library(tidyverse)
library(sf)
repeated_parcel_numbers <- parcel_sf %>% group_by(parcelnum) %>%
mutate(n = n()) %>% filter(n > 1) %>% ungroup()
repeated_parcel_numbers <- repeated_parcel_numbers %>% select(parcelnum) %>%
mutate(rownumber = row_number()) %>% mutate(plotted = FALSE)
for (row_1 in repeated_parcel_numbers$rownumber) {
if (repeated_parcel_numbers$plotted[row_1] == FALSE) {
for (row_2 in repeated_parcel_numbers$rownumber) {
if (row_2 != row_1 &
repeated_parcel_numbers$parcelnum[row_1] == repeated_parcel_numbers$parcelnum[row_2]) {
print(ggplot(repeated_parcel_numbers %>% filter(rownumber %in% c(row_1, row_2)) %>%
select(parcelnum)) + geom_sf() + ggtitle(repeated_parcel_numbers$parcelnum[row_1]))
repeated_parcel_numbers[c(row_1, row_2),]$plotted<- TRUE
}
}
}
}
rm(repeated_parcel_numbers)
Having verified that the repeats of the parcel numbers represent distinct parcels, we modify the parcel numbers to make them distinct.
#add iteration numbers to parcelnum (a character variable) within each set of repeats
parcel_sf <- parcel_sf %>% group_by(parcelnum) %>% mutate(n = n(), repeat_number = row_number()) %>%
ungroup() %>%
mutate(parcelnum = ifelse(n > 1, str_c(parcelnum, "_", repeat_number), parcelnum)) %>%
select(-n, -repeat_number)
#test for repeats
temp <- parcel_sf %>% as.data.frame() %>% select(-geometry) %>%
group_by(parcelnum) %>% mutate(n = n()) %>% filter(n > 1)
temp
We now use sf::st_join, with st_overlaps, to test for parcel overlap, with the result, according to the test, that there are several thousand pairs of parcels that overlap (share at least some portion of interior area).
#check for overlap of the parcels in parcel_sf
spatial_overlaps <- st_join(parcel_sf, parcel_sf,
st_overlaps, left = FALSE) %>% filter(row_num.x != row_num.y)
## although coordinates are longitude/latitude, st_overlaps assumes that they are planar
nrow(spatial_overlaps)
## [1] 9194
Investigating the cases of overlap (according to st_overlaps) with plots of random selections of the supposedly overlapping pairs suggests that the apparent overlap is not significant (see below). It may be due to the fact that st_overlaps treats latitude and longitude values as planar coordinates. I will assume that parcels do not overlap.
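Before leaning on that assumption, a further check is possible (sketched below, not something I have run as part of this analysis): measure the shared area for a flagged pair directly in a planar projection, using EPSG:2253 (the Michigan South projection used later in this document) together with st_intersection and st_area. A genuinely negligible overlap should come out as a tiny area relative to the parcels themselves.
#sketch: area of overlap between two parcels, computed in a planar CRS (feet for EPSG:2253)
overlap_area <- function(parcel_a, parcel_b) {
  a <- st_transform(parcel_a, 2253)
  b <- st_transform(parcel_b, 2253)
  #st_intersection returns the shared region(s); st_area gives their sizes in CRS units
  sum(st_area(st_intersection(a, b)))
}
#e.g., for the first flagged pair (parcel numbers taken from spatial_overlaps):
#overlap_area(parcel_sf %>% filter(parcelnum == spatial_overlaps$parcelnum.x[1]),
#             parcel_sf %>% filter(parcelnum == spatial_overlaps$parcelnum.y[1]))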
#select a random sample of these cases of two parcels that overlap
set.seed(55)
sample <- sample(1:nrow(spatial_overlaps), 20)
parcel_selection <- spatial_overlaps[sample,]
#plot the elements of the random sample of pairs for which st_overlaps had indicated an overlap
for (index in 1:nrow(parcel_selection)) {
row_1 <- parcel_sf %>% select(parcelnum) %>%
filter(parcelnum == parcel_selection$parcelnum.x[index])
row_2 <- parcel_sf %>% select(parcelnum) %>%
filter(parcelnum == parcel_selection$parcelnum.y[index])
print(ggplot(rbind(row_1, row_2)) + geom_sf(aes(fill = parcelnum)))
}
rm(duplicate_parcel_numbers_in_parcel_data, selection_vector, unique_parcels, row_1, row_2, spatial_repeats, index, sample, parcel_selection, spatial_overlaps)
#(note that eval is set to false, to avoid rerunning this code every time I implement "run all chunks")
#investigate a few of the cases by road map
road_map <- function(sf_shape) {
df <- tibble(longitude = st_coordinates(sf_shape)[,1],
latitude = st_coordinates(sf_shape)[,2])
#left/bottom/right/top for bounding box
bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002,
max(df$longitude) + 0.002, max(df$latitude) + 0.002)
detroit_gg <- get_map(location = bounding_box,
maptype = "roadmap")
ggmap(detroit_gg)
}
street_map_1 <- road_map(parcel_sf %>% filter(parcelnum == "13000116.003"))
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.341003,-83.011428&zoom=17&size=640x640&scale=2&maptype=roadmap&language=en-EN
street_map_1 + ggtitle("Parcel 13000116.003") + theme(plot.title = element_text(hjust = 0.5))
#investigate a few of the cases by satellite map
satellite_map <- function(sf_shape) {
df <- tibble(longitude = st_coordinates(sf_shape)[,1],
latitude = st_coordinates(sf_shape)[,2])
#left/bottom/right/top for bounding box
bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002,
max(df$longitude) + 0.002, max(df$latitude) + 0.002)
detroit_gg <- get_map(location = bounding_box,
maptype = "satellite")
ggmap(detroit_gg)
}
street_map_2 <- satellite_map(parcel_sf %>% filter(parcelnum == "13000116.003"))
## Warning: bounding box given to google - spatial extent only approximate.
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.341003,-83.011428&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN
street_map_2 + ggtitle("Parcel 13000116.003") + theme(plot.title = element_text(hjust = 0.5))
We begin the process of creating a list of buildings from the parcels data. One of the issues to consider is the number of buildings on a parcel. Should we restrict our analysis to those parcels that have only one building? In any case, two columns in parcel_sf bear on this aspect: building_s and num_buildi.
#exploration of the numbers of buildings on parcels
buildings_1 <- parcel_sf %>% filter(!is.na(building_s))
levels(buildings_1$building_s)
## [1] "1/2 DUPLEX" "2 STY COLONIAL" "APARTMENT"
## [4] "APT FLAT GARDEN" "APT HIGH RISE" "CARRIAGE HOUSE"
## [7] "DUPLEX" "FOUR FAMILY" "GARAGE/SHED"
## [10] "INCOME BUNGALOW" "LARGE FLATS" "LOFT APT WALKUP"
## [13] "MULTI DWELLING" "ROW HOUSE" "SINGLE FAMILY"
## [16] "TWO FAMILY FLAT"
buildings_2 <- parcel_sf %>% filter(num_buildi > 0)
buildings_3 <- parcel_sf %>% filter(!is.na(building_s) & num_buildi > 0)
buildings_4 <- parcel_sf %>% filter(is.na(building_s) & num_buildi > 0)
nrow(buildings_4)
## [1] 18749
buildings_5 <- parcel_sf %>% filter(!is.na(building_s) & !num_buildi > 0)
#print, but first cut out the geometry column to avoid an error message.
as.data.frame(buildings_5) %>% select(-geometry)
buildings_5 contains zero rows, thus indicating that buildings_1 is a subset of buildings_2. On the other hand, buildings_4, the set of parcels with (according to the data) at least one building but with the building type unspecified (NA in building_s), contains 18,749 rows.
buildings_6 <- parcel_sf %>% filter(num_buildi > 1)
ggplot(buildings_1) + geom_bar(aes(x = as.factor(num_buildi)))
ggplot(buildings_4) + geom_bar(aes(x = as.factor(num_buildi)))
ggplot(buildings_6) + geom_bar(aes(x = building_s))
related <- buildings_1 %>% filter(!is.na(related_pa))
set.seed(27)
sample <- sample_n(related, 20)
dfs <- list()
for (index in 1:nrow(sample)) {
row_1 <- sample[index,]
row_2 <- parcel_sf %>% filter(parcelnum == sample$related_pa[index])
df <- rbind(row_1, row_2)
print(ggplot(df) + geom_sf(aes(fill = parcelnum)))
dfs[[index]] <- df
}
as.data.frame(dfs[[2]])
rm(buildings_1,buildings_2, buildings_3, buildings_4, buildings_5, buildings_6)
rm(row_1, row_2, sample, related, dfs, df)
I will ignore the “related parcels” (related_pa) in parcel_sf. Before making the final call as to what subset of the parcels dataframe will provide the stand-in for buildings, I will work on the data that we will use to form the labels.
Like the parcel_sf dataframe, the dismantle permits data also contains some duplicate parcel numbers over rows, in some cases over rows that contain address information suggesting that the location is different. (It also indicates that a few individual locations, identified with addresses, had more than one associated dismantle permit. This need not be problematic; a permit could expire before the work is carried out, or there could be more than one structure on a parcel.) These repetitions are in dup_par_num_over_distinct_addresses, which contains 272 rows. I should also note that, in most of these cases with duplicate parcel numbers (and addresses indicating different locations), the recorded latitude and longitude are identical. We can thus infer that some of this location information is incorrect.
#repeated parcel numbers in the dismantle permits data
dup_par_num_in_dismantle_data <-
dismantle_permits_sf %>%
group_by(`Parcel Number`) %>%
mutate(n = n()) %>%
ungroup() %>%
filter(n > 1) %>%
select(`Parcel Number`, `Site Address`) %>%
arrange(`Parcel Number`)
rm(dup_par_num_in_dismantle_data)
#parcel numbers in the dismantle permits data that are distributed over distinct address strings
dup_par_num_over_distinct_addresses <-
dismantle_permits_sf %>%
group_by(`Parcel Number`) %>%
mutate(parcel_number_occurances = n()) %>%
ungroup() %>%
filter(parcel_number_occurances > 1) %>%
group_by(`Parcel Number`, `Site Address`) %>%
mutate(m = n()) %>%
filter(m < parcel_number_occurances) %>%
arrange(`Parcel Number`)
#remove extraneous variables and then geocode the dismantle site addresses
geocoded_duplicates <- dup_par_num_over_distinct_addresses %>%
as.data.frame %>%
select(-geometry) %>%
mutate(location = map(str_c(`Site Address`, ", Detroit, Michigan"), geocode_pause))
#avoid having to do the geocoding again when re-running the code
write_rds(geocoded_duplicates, "./data/geocoded_duplicates.rds")
Let’s have a look at the relationship between the numbers of buildings on the parcels, as indicated in parcel_sf, and the numbers of times parcels occur in dup_par_num_over_distinct_addresses (which was constructed from the dismantle permits data).
#read the saved geocoded file into R
geocoded_duplicates <- read_rds("./data/geocoded_duplicates.rds")
#unpack the column of dataframes created by the geocoding, remove rows with identical geocoding results, and convert the resulting dataframe into a set of sf objects
geocoded_duplicates <- remove_null_locations(geocoded_duplicates) %>%
unnest(location) %>% distinct(lon, lat, .keep_all = TRUE) %>%
st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf))
#investigate the relationship between the number of buildings on a parcel, as specified in parcel_sf, and the number of occurrences of parcel numbers in geocoded_duplicates
joined_w_parcels_sf <- as.data.frame(geocoded_duplicates) %>%
group_by(`Parcel Number`) %>%
summarize(`number of locations` = n()) %>%
inner_join(as.data.frame(parcel_sf), by = c("Parcel Number" = "parcelnum"))
ggplot(joined_w_parcels_sf) +
geom_count(aes(x = as.factor(`number of locations`),
y = num_buildi))
#cut out the greater values of num_buildi, so as to widen the vertical scale
ggplot(joined_w_parcels_sf %>% filter(num_buildi < 11)) +
geom_count(aes(x = as.factor(`number of locations`),
y = as.factor(num_buildi)))
Note the large number of cases for which num_buildi is equal to zero. If the data is accurate, then it is a reasonable guess that such cases reflect parcels for which num_buildi has been updated after all buildings have been removed.
temp3 <- geocoded_duplicates %>% #select(`Parcel Number`, location, `Permit Location`) %>%
filter(`Parcel Number` == "13006809.")
#function for putting a set of sf points on a satellite map
sf_points_map <- function(sf_df) {
df <- sf_df %>% mutate(longitude = st_coordinates(sf_df)[,1],
latitude = st_coordinates(sf_df)[,2]) %>%
as.data.frame %>% select(longitude, latitude)
#left/bottom/right/top for bounding box
bounding_box <- c(min(df$longitude) - 0.002, min(df$latitude) - 0.002,
max(df$longitude) + 0.002, max(df$latitude) + 0.002)
detroit_gg <- get_map(location = bounding_box,
maptype = "satellite")
ggmap(detroit_gg) + geom_point(data = df, aes(x = longitude, y = latitude),
color = "red")
}
#map of points with Parcel Number listed as "13006809."
sf_points_map(temp3)
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.410747,-83.043346&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN
#note that "13006809." is not reflected as a parcel number in parcel_sf (noting that parcel_sf$parcelnum is an integer vector).
nrow(parcel_sf %>% filter(parcelnum == "13006809."))
## [1] 0
#spatial join to see where these points in the dismantle permits dataset hook up with the parcels dataset, with the result that the parcel contains three of the seven points with `Parcel Number` = "13006809.". They are nearby, although some may be closer to other parcels.
temp4 <- st_join(parcel_sf, temp3, left = FALSE, st_contains)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
temp4$`Parcel Number`
## [1] "13006809." "13006809." "13006809."
as.numeric(as.character(temp4$parcelnum))
## [1] 13006809 13006809 13006809
temp5 <- parcel_sf %>%
filter((!as.numeric(as.character(parcelnum)) < 13006809) &
as.numeric(as.character(parcelnum)) < 13006810) %>%
select(parcelnum)
as.data.frame(temp5$parcelnum)
#plot the points with `Parcel Number` "13006809." together with the nearby parcels (temp5) on a satellite map
sf_points_map(temp3) +
geom_sf(data = temp5 %>% select(geometry), crs = 3857, inherit.aes = FALSE)
## converting bounding box to center/zoom specification. (experimental)
## Source : https://maps.googleapis.com/maps/api/staticmap?center=42.410747,-83.043346&zoom=17&size=640x640&scale=2&maptype=satellite&language=en-EN
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
rm(temp3, temp4, temp5, joined_w_parcels_sf, geocoded_duplicates)
rm(demolition_sample, geocoded_duplicates)
rm(temp, temp2)
It is notable that one of the positions associated with the dismantle permits is far enough from the two parcels to be potentially closer to another parcel, which (upon further investigation with Google maps) appears to be adjacent to a vacant lot, on which a building, now dismantled, may have sat.
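One way to check this sort of thing systematically (a sketch only, not part of the analysis above) would be to project the geocoded points and the parcels to a planar CRS and use sf::st_nearest_feature to look up the nearest parcel for each point; the function and the example call below are hypothetical.
#sketch: for each point in an sf point data frame, find the parcelnum of the nearest parcel
nearest_parcelnum <- function(points_sf, parcels) {
  pts <- st_transform(points_sf, 2253)
  pars <- st_transform(parcels, 2253)
  #st_nearest_feature returns, for each point, the row index of the nearest parcel polygon
  idx <- st_nearest_feature(pts, pars)
  pars$parcelnum[idx]
}
#e.g., nearest_parcelnum(some_geocoded_points_sf, parcel_sf), for a hypothetical set of geocoded points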
Further investigation of Parcel Number duplicates in the dismantle permits data:
#read in the saved set of examples in the dismantle permits data that involve repeats of parcel numbers over different address strings.
dismantle_duplicates_geocoded <- read_rds("./data/geocoded_duplicates.rds")
#unpack the column of data frames (the outputs of geocode())
dismantle_duplicates_geocoded <- remove_null_locations(dismantle_duplicates_geocoded) %>%
unnest(location)
#note the 16 rows of dismantle_duplicates_geocoded for which another row lists the same longitude and latitude
dismantle_duplicates_geocoded %>%
group_by(lon, lat) %>% mutate(n = n()) %>% filter(n > 1)
#keep only the first of any group of rows with the same longitude and latitude
dismantle_duplicates_geocoded <-
dismantle_duplicates_geocoded %>% distinct(lon, lat, .keep_all = TRUE)
head(dismantle_duplicates_geocoded)
#manually cut out the one remaining apparent duplicate location
dismantle_duplicates_geocoded <- dismantle_duplicates_geocoded %>%
filter(!(`Site Address` == "3200 E LAFAYETTE-MARTIN LUTHER KING HIGH"))
dismantle_duplicates_geocoded <- dismantle_duplicates_geocoded %>%
st_as_sf(coords = c("lon", "lat"), crs = st_crs(parcel_sf)) %>%
select(-parcel_number_occurances, -m)
#where possible, find the parcels that contain the positions corresponding to the coordinates
spatial_join_within <-
st_join(dismantle_duplicates_geocoded,
parcel_sf %>% select(parcelnum),
st_within)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#change the parcel numbers (for the dismantle permits data) to those of the parcels that contain the points specified by the coordinates, and remove the parcelnum column (from the parcel_sf data frame)
dismantle_duplicates_geocoded <- spatial_join_within %>%
mutate(`Parcel Number` = ifelse(!is.na(parcelnum),
as.character(parcelnum),
`Parcel Number`)) %>%
select(-parcelnum)
#replace rows with duplicates of parcel numbers over distinct addresses with the cleaned set of rows, as per above
dismantle_permits_sf <- dismantle_permits_sf %>%
filter(!(`Parcel Number` %in% dup_par_num_over_distinct_addresses$`Parcel Number`)) %>%
rbind(dismantle_duplicates_geocoded)
rm(dismantle_duplicates_geocoded, dup_par_num_over_distinct_addresses, spatial_join_within)
rm(index)
Using Google Street View to investigate some more dismantle permit entries for which there are other entries with the same parcel number but different addresses, we find mostly different locations at the same building (e.g. the two sides of a duplex), or perhaps two attached buildings.
Continuing with the matter of assigning labels to the buildings, we now look at the completed demolitions data, completed_demolitions_sf. Although the initial idea was to use the dismantle permits data to assign the labels (blighted, i.e. to be dismantled, and not blighted, i.e. not to be dismantled), we should consider using the completed demolitions data instead or in addition. Although this data may omit blighted buildings which, for whatever reason, have not been demolished or which, as suggested on the City-of-Detroit web page from which the file was downloaded, may have been demolished in a hurry, the data appears to be relatively clean and genuinely reflective of blightedness. For example, if a new property owner dismantled a well-kept building for some reason, say to build a larger home, then, it may seem, the demolition of this building would be reflected in the dismantle-permits dataset but not in the completed-demolitions dataset.
#look for repeats of "Parcel ID" on completed_demolitions_sf, finding none
as.data.frame(completed_demolitions_sf) %>%
select(-geometry) %>%
group_by(`Parcel ID`) %>%
mutate(n = n()) %>%
filter(n > 1)
#check to see how this data joins with the parcels data, noting that the inner join contains exactly seven fewer rows than completed_demolitions_sf contains
demolition_join_on_parcels <- as.data.frame(parcel_sf) %>% filter(!is.na(parcelnum)) %>%
inner_join(as.data.frame(completed_demolitions_sf %>% filter(!is.na(`Parcel ID`))),
by = c("parcelnum" = "Parcel ID"))
#look for repeats of parcelnum in demolition_join_on_parcels, finding none. we thus find near total agreement between the parcel_sf data frame and the completed demolitions data frame--given the information in parcel_sf, the parcel numbers in the completed demolitions dataset appear to be consistent with the location data.
as.data.frame(demolition_join_on_parcels) %>%
select(-geometry.x, -geometry.y) %>%
group_by(parcelnum) %>%
mutate(n = n()) %>%
filter(n > 1)
#have a look at the seven rows of completed_demolitions for which the `Parcel ID` did not match with a value of parcelnum in parcel_sf
dismantled_parcels_left_out <- completed_demolitions_sf %>%
filter(!`Parcel ID` %in% demolition_join_on_parcels$parcelnum)
#link these to the parcels dataset, parcel_sf, using a spatial join
dismantled_left_out_joined <- dismantled_parcels_left_out %>%
st_join(parcel_sf %>% select(geometry), st_covered_by, left = FALSE)
## although coordinates are longitude/latitude, st_covered_by assumes that they are planar
rm(demolition_sample, demolition_join_on_parcels, dismantled_left_out_joined, dismantled_parcels_left_out)
## Warning in rm(demolition_sample, demolition_join_on_parcels,
## dismantled_left_out_joined, : object 'demolition_sample' not found
In an effort to make our predictors properly relevant to what they are predicting, we will restrict our labels to the period from June of 2016 onward. We thus need to convert some of the date information, which is in the form of a string, into a proper date format. Although three datasets are being used to construct our labels, the conversion will be necessary for only two of the datasets.
require(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
completed_demolitions_sf$`Demolition Date` <- mdy(completed_demolitions_sf$`Demolition Date`)
dismantle_permits_sf$`Permit Issued` <- mdy(dismantle_permits_sf$`Permit Issued`)
Our parcel list, the stand-ins for buildings, will be formed from those parcels that satisfy one or more of the following conditions: (1) the parcel lies within one of the Hardest Hit Fund areas and, according to parcel_sf, has at least one building on it (num_buildi > 0); (2) the parcel lies within one of the Hardest Hit Fund areas and is associated, spatially or by parcel number, with a completed demolition, an upcoming demolition, or a dismantle permit.
Having determined that the great majority of the parcels have only one building, we may guess that the existence of a few parcels with repeats won’t have much of a distorting effect.
After construction of a list of parcels, we proceed to a set of labels (blighted or not blighted) for each parcel, based on the building’s either having been dismantled or having had a dismantle permit associated with it. In associating dismantle permits and actual demolitions with individual parcels, I have prioritized spatial association (using sf::st_join) over matching of parcel numbers. (That is, in the limited number of cases where these methods of association disagree, we use the spatial association.)
#the set of all rows of parcel_sf that are within one of the hardest hit areas
parcels_in_hardest_hit_areas <- parcel_sf %>%
st_join(Hardest_Hit_Fund_Areas, st_within, left = FALSE)
## although coordinates are longitude/latitude, st_within assumes that they are planar
#add a row number to completed_demolitions_sf, so as to keep an account of which rows have been included in the following joins
completed_demolitions_sf <- completed_demolitions_sf %>% mutate(rownumber = row_number())
#inner spatial join of parcels_in_hardest_hit_areas and completed_demolitions_sf, on the basis of containment of the demolition point in the parcel
completed_in_fund_areas_parcel_spatial <- parcels_in_hardest_hit_areas %>%
st_join(completed_demolitions_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
completed_in_fund_areas_parcel_spatial <- completed_in_fund_areas_parcel_spatial %>%
filter(`Demolition Date` > mdy("05/31/2016"))
#inner join of parcels_in_hardest_hit_areas and completed_demolitions_sf, on the basis of identity of parcel numbers, for those elements of completed_demolitions_sf that were not captured in the spatial join
completed_in_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
inner_join(as.data.frame(completed_demolitions_sf %>%
filter(! rownumber %in% completed_in_fund_areas_parcel_spatial$rownumber)),
by = c("parcelnum" = "Parcel ID"))
completed_in_fund_areas_parcelnum_join <- completed_in_fund_areas_parcelnum_join %>%
filter(`Demolition Date` > mdy("05/31/2016"))
#add a row number to upcoming_demolitions_sf, so as to keep an account of which rows have been included in the following joins
upcoming_demolitions_sf <- upcoming_demolitions_sf %>% mutate(rownumber = row_number())
#inner spatial join of parcels_in_hardest_hit_areas and upcoming_demolitions_sf, on the basis of containment of the demolition point in the parcel
upcoming_in_fund_areas_parcel_spatial <- parcels_in_hardest_hit_areas %>%
st_join(upcoming_demolitions_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
#inner join of parcels_in_hardest_hit_areas and upcoming_demolitions_sf, on the basis of identity of parcel numbers, for those elements of upcoming_demolitions_sf that were not captured in the spatial join
upcoming_in_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
inner_join(as.data.frame(upcoming_demolitions_sf %>%
filter(! rownumber %in% upcoming_in_fund_areas_parcel_spatial$rownumber)),
by = c("parcelnum" = "Parcel ID"))
#repeat the above three steps for the parcels that have dismantle permits associated with them
dismantle_permits_sf <- dismantle_permits_sf %>% mutate(rownumber = row_number())
dismantle_permit_fund_areas_spatial <- parcels_in_hardest_hit_areas %>%
st_join(dismantle_permits_sf, st_contains, left = FALSE)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
dismantle_permit_fund_areas_spatial <- dismantle_permit_fund_areas_spatial %>%
filter(`Permit Issued` > mdy("05/31/2016"))
dismantle_permit_fund_areas_parcelnum_join <- parcels_in_hardest_hit_areas %>%
inner_join(as.data.frame(dismantle_permits_sf %>%
filter(! rownumber %in% dismantle_permit_fund_areas_spatial$rownumber)),
by = c("parcelnum" = "Parcel Number"))
dismantle_permit_fund_areas_parcelnum_join <- dismantle_permit_fund_areas_parcelnum_join %>%
filter(`Permit Issued` > mdy("05/31/2016"))
#data frame with the parcel numbers of all parcels that have had at least one dismantled building or dismantle permit associated with them
blighted_parcels <- rbind(as.data.frame(completed_in_fund_areas_parcel_spatial) %>% select(parcelnum),
as.data.frame(completed_in_fund_areas_parcelnum_join) %>% select(parcelnum),
as.data.frame(dismantle_permit_fund_areas_spatial) %>% select(parcelnum),
as.data.frame(dismantle_permit_fund_areas_parcelnum_join) %>% select(parcelnum),
as.data.frame(upcoming_in_fund_areas_parcel_spatial) %>% select(parcelnum),
as.data.frame(upcoming_in_fund_areas_parcelnum_join) %>% select(parcelnum)) %>%
distinct(parcelnum)
#data frame with the parcel numbers for all parcels in the hardest hit areas that have or have had at least one building on the parcel
parcels_set <- as.data.frame(blighted_parcels) %>%
select(parcelnum) %>%
rbind(as.data.frame(parcels_in_hardest_hit_areas) %>%
filter(num_buildi > 0) %>% select(parcelnum)) %>%
distinct(parcelnum)
#our labels
labels <- parcels_set %>% mutate(blighted = ifelse(parcelnum %in% blighted_parcels$parcelnum, 1, 0))
#note the balance of blighted versus not-blighted labels
labels %>% count(blighted) %>% mutate(fraction = n / sum(n))
rm(completed_in_fund_areas_parcel_spatial, completed_in_fund_areas_parcelnum_join, dismantle_permit_fund_areas_spatial, dismantle_permit_fund_areas_parcelnum_join, upcoming_in_fund_areas_parcel_spatial, upcoming_in_fund_areas_parcelnum_join, parcels_set, blighted_parcels, parcels_in_hardest_hit_areas)
We now construct a balanced set of labels, with roughly equal numbers of blighted and non-blighted examples. We then divide the set of parcel numbers into training and test sets, on a ratio of 0.85 to 0.15. Note that we will be using 10-fold cross validation on the training set, with the 15 percent of all of the items set aside for a final test of our model.
#note the balance of blighted versus not-blighted labels
labels %>% count(blighted) %>% mutate(fraction = n / sum(n))
labels_balanced <- labels %>% filter(blighted == 0) %>% sample_n(2200) %>%
rbind(labels %>% filter(blighted == 1))
set.seed(82)
training_rows <- caret::createDataPartition(labels_balanced$blighted, p = 0.85, list = FALSE)
training_parcelnums <- labels_balanced[training_rows,]$parcelnum
testing_parcelnums <- labels_balanced[-training_rows,]$parcelnum
Having explored and cleaned the data and constructed a set of labels, we proceed to the construction of predictive models. The first step will be to add some properties to the set of labels.
library(lubridate)
#add the parcel geometry to the labels
parcels_with_labels <- parcel_sf %>% select(parcelnum) %>%
inner_join(labels_balanced, by = "parcelnum")
#check to see that there are no parcel number repeats (finding none)
parcels_with_labels %>% as.data.frame() %>% select(-geometry) %>% count(parcelnum) %>% filter(n > 1)
#note that `Ticket ID` provides a key for blight_violations_sf (temp has zero elements)
temp <- blight_violations_sf %>% as.data.frame() %>% select(-geometry) %>%
count(`Ticket ID`) %>% filter(n > 1)
temp
rm(temp)
blight_violations_sf <- blight_violations_sf %>%
filter(mdy(`Violation Date`) > mdy("05/31/2009"))
#have a look at the distribution of violators recorded in blight_violations_sf (as indicated by `Violator ID`), noting that the following construction contains zero rows (Violator ID thus doesn't seem to be useful)
violator_distribution <- blight_violations_sf %>%
as.data.frame() %>% select(-geometry) %>% count(`Violator ID`) %>% filter(n > 1)
violator_distribution
rm(violator_distribution)
#associate the blight violations data with parcels_with_labels, first with a spatial join and then, for any rows in blight_violations_sf for which the spatial association fails, with an inner join on the parcel numbers. recall that parcels_with_labels is restricted to the hardest hit areas
spatial_join <- parcels_with_labels %>%
st_join(blight_violations_sf, st_contains, left = FALSE) %>%
select(-`Violation Parcel ID`)
## although coordinates are longitude/latitude, st_contains assumes that they are planar
#make use of the key `Ticket ID` to find the leftovers
blight_violations_leftovers <- blight_violations_sf %>% filter(! `Ticket ID` %in% spatial_join$`Ticket ID`)
#now handle the leftovers
parcelnum_join <- parcels_with_labels %>%
inner_join((as.data.frame(blight_violations_leftovers) %>% select(-geometry)),
by = c("parcelnum" = "Violation Parcel ID"))
blight_violation_data <- rbind(spatial_join, parcelnum_join)
blight_violation_data <- inner_join(parcels_with_labels,
as.data.frame(blight_violation_data) %>% select(-geometry, -blighted),
by = "parcelnum")
rm(spatial_join, blight_violations_leftovers, parcelnum_join)
We now consider a histogram of the number of blight violations per parcel.
#dataset with two calculated variables, perhaps to be used as predictors: the total number of blight violations by parcel and the total fines by parcel
blight_violation_tallies <- blight_violation_data %>% as.data.frame() %>% select(-geometry) %>%
mutate(`Fine Amount` = as.numeric(str_sub(`Fine Amount`, start = 2))) %>%
group_by(parcelnum, blighted) %>%
summarize(number_of_violations_by_parcel = n(), total_fines_by_parcel = sum(`Fine Amount`))
blight_violation_tallies <- as.tibble(blight_violation_tallies)
parcels_without_blight_violations <- labels_balanced %>%
filter(!parcelnum %in% blight_violation_tallies$parcelnum) %>%
mutate(number_of_violations_by_parcel = 0, total_fines_by_parcel = 0)
blight_violation_tallies <- bind_rows(blight_violation_tallies, parcels_without_blight_violations)
blight_violation_summary <- blight_violation_tallies %>%
filter(number_of_violations_by_parcel < 19) %>%
group_by(number_of_violations_by_parcel, blighted) %>% summarize(n = n())
ggplot(blight_violation_summary) +
geom_col(aes(x = number_of_violations_by_parcel,
y = n,
fill = as.factor(blighted)),
position = "dodge2")
rm(blight_violation_summary)
Now consider a histogram of the fine totals.
head(blight_violation_tallies)
blight_violation_tallies %>% filter(total_fines_by_parcel < 5000) %>%
ggplot(aes(x = total_fines_by_parcel, fill = as.factor(blighted))) +
geom_histogram(binwidth = 200)
We will next construct a simple logistic model, using the constructed dataframe that we used for the two plots above. The two predictors will be the number of blight violations and the total amount in fines.
#separate the examples to be used for training from the 15 percent of the examples withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- blight_violation_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
models <- 1:10 %>% map(~ glm(blighted ~ total_fines_by_parcel + number_of_violations_by_parcel,
data = train[-k_folds[[.x]],], family = "binomial"))
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5759393
sd(as.numeric(accuracies))
## [1] 0.02432749
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 163 136
## TRUE 24 41
## truth
## pred 0 1
## FALSE 155 118
## TRUE 37 54
## truth
## pred 0 1
## FALSE 166 131
## TRUE 22 46
## truth
## pred 0 1
## FALSE 154 142
## TRUE 23 45
## truth
## pred 0 1
## FALSE 156 125
## TRUE 30 53
## truth
## pred 0 1
## FALSE 161 129
## TRUE 20 54
## truth
## pred 0 1
## FALSE 176 130
## TRUE 18 40
## truth
## pred 0 1
## FALSE 165 105
## TRUE 33 61
## truth
## pred 0 1
## FALSE 143 145
## TRUE 24 52
## truth
## pred 0 1
## FALSE 160 121
## TRUE 31 52
#construct a grid of predictor values and then represent the predictions with colors
library(modelr)
number_of_violations_by_parcel <-
seq_range(train$number_of_violations_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
total_fines_by_parcel <- seq_range(train$total_fines_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
grid <- crossing(number_of_violations_by_parcel,
total_fines_by_parcel)
#have a look at the third model (among the k-fold validation models)
grid <- grid %>% add_predictions(models[[3]])
ggplot(grid) + geom_point(aes(x = number_of_violations_by_parcel,
y = total_fines_by_parcel, color = pred > 0.5))
Note that the models are much better at predicting negative instances (truth = 0) than positive instances (truth = 1). (They get most of the positive instances wrong.)
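To make that class-wise behavior explicit, the following sketch (not part of the original modeling code) computes, for each fold, the recall on blighted parcels (sensitivity) and on non-blighted parcels (specificity) from the stored predictions.
#per-fold recall for each class, from the logistic-model predictions above
class_recalls <- 1:10 %>% map_dfr(function(i) {
  truth <- train[k_folds[[i]],]$blighted
  pred <- as.numeric(predictions[[i]] > 0.5)
  tibble(fold = i,
         sensitivity = mean(pred[truth == 1] == 1),
         specificity = mean(pred[truth == 0] == 0))
})
class_recalls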
We now compare this with a simple rpart (decision tree) model.
library(rpart)
#separate the examples to be used for training from the 15 percent of the examples withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- blight_violation_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
models <- 1:10 %>% map(~ rpart(blighted ~ total_fines_by_parcel + number_of_violations_by_parcel,
data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5951626
sd(as.numeric(accuracies))
## [1] 0.03001074
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 158 100
## TRUE 28 78
## truth
## pred 0 1
## FALSE 146 97
## TRUE 40 81
## truth
## pred 0 1
## FALSE 143 99
## TRUE 44 79
## truth
## pred 0 1
## FALSE 141 108
## TRUE 45 70
## truth
## pred 0 1
## FALSE 142 99
## TRUE 44 79
## truth
## pred 0 1
## FALSE 145 111
## TRUE 41 67
## truth
## pred 0 1
## FALSE 139 105
## TRUE 47 73
## truth
## pred 0 1
## FALSE 142 98
## TRUE 44 80
## truth
## pred 0 1
## FALSE 138 111
## TRUE 48 67
## truth
## pred 0 1
## FALSE 132 111
## TRUE 54 67
#construct a grid of predictor values and then represent the predictions with colors
library(modelr)
number_of_violations_by_parcel <-
seq_range(train$number_of_violations_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
total_fines_by_parcel <- seq_range(train$total_fines_by_parcel, n = 10, pretty = TRUE, trim = 0.05)
grid <- crossing(number_of_violations_by_parcel,
total_fines_by_parcel)
#have a look at the ninth model (among the k-fold validation models)
grid <- grid %>% add_predictions(models[[9]])
#plot of the predictions, for one of the models,
ggplot(grid) + geom_point(aes(x = number_of_violations_by_parcel,
y = total_fines_by_parcel, color = pred[,2] > 0.5))
models[[4]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) total_fines_by_parcel< 35 2216 931 0 (0.5798736 0.4201264) *
## 3) total_fines_by_parcel>=35 1061 390 1 (0.3675778 0.6324222) *
Note that the simple decision-tree model is somewhat better than the logistic model at teasing out the positive instances, and somewhat less effective with respect to the negative instances.
We now add the improve_detroit_issues dataset to the model. Note that the improve detroit issues dataset does not contain data for 2013 or earlier.
#note the possible issue types, and then restrict the set to include issues that would seem to be associated with blight in a particular building
levels(as.factor(improve_detroit_issues$`Issue Type`))
## [1] "Blocked Catch Basin"
## [2] "Cemetery Issue"
## [3] "Clogged Drain"
## [4] "Curbside Solid Waste Issue"
## [5] "Dead Animal Removal"
## [6] "DPW - Debris Removal - DPW USE ONLY"
## [7] "DPW - Other environmental - DPW USE ONLY"
## [8] "Fire Hydrant Issue"
## [9] "Illegal Dump Sites"
## [10] "Manhole Cover Issue"
## [11] "New LED Street Light Out"
## [12] "Other - Not within City jurisdiction"
## [13] "Other - Not within scope of City services"
## [14] "Other - Referred to other City Department"
## [15] "Park Issue"
## [16] "Potholes"
## [17] "Residential Snow Removal Issue"
## [18] "Rodent Extermination - BSEED Only"
## [19] "Running Water in a Home or Building"
## [20] "Squatters Issue"
## [21] "Squatters Issue - TEST ONLY"
## [22] "Street Light / Street Light Pole Major Repair"
## [23] "Street Light Out"
## [24] "Street Light Pole Down"
## [25] "Traffic Sign Issue"
## [26] "Traffic Signal Issue"
## [27] "Tree Issue"
## [28] "Water Main Break"
#we'll try to work with the following dataset, which keeps the issue types that seem most closely associated with blight at a particular property
improve_detroit_issues_property <- improve_detroit_issues %>%
filter(`Issue Type` %in% c("Residential Snow Removal Issue", "Running Water in a Home or Building",
"Curbside Solid Waste Issue", "DPW - Debris Removal - DPW USE ONLY",
"Rodent Extermination - BSEED Only", "Squatters Issue", "Illegal Dump Sites"))
#add the parcel geometry to the dataset with the blight violation tallies
blight_violation_tallies <- parcels_with_labels %>% select(geometry, parcelnum) %>%
inner_join(blight_violation_tallies, by = "parcelnum")
#expand the parcel geometry to include all points within 200 meters of the parcel
library(units)
##
## Attaching package: 'units'
## The following object is masked from 'package:base':
##
## %*%
distance <- set_units(200, "m")
blight_violation_tallies <- blight_violation_tallies %>%
st_transform(2253) %>% st_buffer(distance)
improve_detroit_issues_tallies <- improve_detroit_issues_property %>%
st_transform(2253) %>%
st_join(blight_violation_tallies, join = st_within) %>%
as.tibble() %>% select(-geometry) %>% count(parcelnum)
#note the distribution of the tally numbers
improve_detroit_issues_tallies %>% filter(n < 200) %>% ggplot(aes(x = n)) +
geom_histogram(bins = 50) + labs(y = "count within 200 meters") +
ggtitle("Improve Detroit Issues Associated with Properties") +
theme(plot.title = element_text(hjust = 0.5))
#add the improve-Detroit issue tallies to the blight violation tallies
improve_detroit_and_violations_tallies <- blight_violation_tallies %>% as.data.frame() %>%
select(-geometry) %>% left_join(improve_detroit_issues_tallies, by = "parcelnum") %>%
mutate(improve_issues_tallies = ifelse(is.na(n), 0, n)) %>% select(-n)
We add this data to our models, and again try both decision-tree and logistic-regression methods.
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- improve_detroit_and_violations_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel + improve_issues_tallies
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5784134
sd(as.numeric(accuracies))
## [1] 0.03085671
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 164 135
## TRUE 15 50
## truth
## pred 0 1
## FALSE 163 105
## TRUE 36 60
## truth
## pred 0 1
## FALSE 154 128
## TRUE 27 56
## truth
## pred 0 1
## FALSE 169 128
## TRUE 20 47
## truth
## pred 0 1
## FALSE 141 155
## TRUE 23 45
## truth
## pred 0 1
## FALSE 175 113
## TRUE 26 50
## truth
## pred 0 1
## FALSE 155 126
## TRUE 31 52
## truth
## pred 0 1
## FALSE 168 120
## TRUE 29 47
## truth
## pred 0 1
## FALSE 158 140
## TRUE 23 43
## truth
## pred 0 1
## FALSE 159 130
## TRUE 25 50
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- improve_detroit_and_violations_tallies %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#use the same formula as in the logistic model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.5951648
sd(as.numeric(accuracies))
## [1] 0.03045012
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 142 105
## TRUE 44 73
## truth
## pred 0 1
## FALSE 151 105
## TRUE 35 73
## truth
## pred 0 1
## FALSE 149 108
## TRUE 38 70
## truth
## pred 0 1
## FALSE 149 103
## TRUE 37 75
## truth
## pred 0 1
## FALSE 148 99
## TRUE 38 79
## truth
## pred 0 1
## FALSE 149 98
## TRUE 37 80
## truth
## pred 0 1
## FALSE 132 116
## TRUE 54 62
## truth
## pred 0 1
## FALSE 135 98
## TRUE 51 80
## truth
## pred 0 1
## FALSE 132 92
## TRUE 54 86
## truth
## pred 0 1
## FALSE 139 115
## TRUE 47 63
We now try to work with the crime data.
library(units)
library(lubridate)
#improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>%
# as.data.frame() %>% select(-geometry)
#add the parcel geometry to the data we are using for modeling
improve_detroit_and_violations_tallies <- parcels_with_labels %>% select(-blighted) %>%
left_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>% st_sf()
improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>% st_sf()
distance <- set_units(200, "m") #for crimes within 200 meters
#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_later_crime <- improve_detroit_and_violations_tallies %>%
st_transform(2253) %>% st_buffer(distance)
#ggplot(improve_detroit_and_violations_tallies[100,]) + geom_sf()
tallies_with_later_crime <- tallies_with_later_crime %>%
st_join(crime_12062016_to_03192018 %>% st_transform(2253),
join = st_contains)
#count the number of recent recorded crimes within 200 meters, noting that every parcel has at least one
tallies_with_later_crime <- tallies_with_later_crime %>% as.data.frame() %>% select(-geometry) %>%
group_by(parcelnum) %>% summarize(later_recorded_crime_incidents = n())
#now tally the earlier crime incidents, by roughly repeating the last few steps
#add the parcel geometry to the data we are using for modeling
#tallies_with_earlier_crime <- parcels_with_labels %>% select(-blighted) %>%
# left_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>% st_sf()
#improve_detroit_and_violations_tallies <- improve_detroit_and_violations_tallies %>% st_sf()
earlier_crime_sf <- crime_to_12062016_sf %>%
mutate(INCIDENTDATE = mdy(str_sub(INCIDENTDATE, 1, 8))) %>%
filter(INCIDENTDATE > mdy("12/31/2014")) %>%
mutate(general_type = case_when(
CATEGORY %in% c("AGGRAVATED ASSAULT", "ASSAULT", "HOMICIDE", "ROBBERY",
"ARSON", "DAMAGE TO PROPERTY") ~ "Violent Crime",
CATEGORY %in% c("BURGLARY", "STOLEN PROPERTY", "STOLEN VEHICLE",
"OTHER BURGLARY", "LARCENY") ~ "Property Crime",
CATEGORY %in% c("SEX OFFENSES", "SOLICITATION", "VAGRANCY (OTHER)") ~ "Nuiscance Offences"))
#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_property_crime <- improve_detroit_and_violations_tallies %>%
st_transform(2253) %>% st_buffer(distance)
tallies_with_earlier_property_crime <- tallies_with_earlier_property_crime %>%
st_join(earlier_crime_sf %>% filter(general_type == "Property Crime") %>% st_transform(2253),
join = st_contains)
#count the number of earlier recorded property crimes within 200 meters, noting that every parcel has at least one
tallies_with_earlier_property_crime <- tallies_with_earlier_property_crime %>% as.data.frame() %>% select(-geometry) %>%
group_by(parcelnum) %>% summarize(earlier_property_crime_incidents = n())
#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_violent_crime <- improve_detroit_and_violations_tallies %>%
st_transform(2253) %>% st_buffer(distance)
tallies_with_earlier_violent_crime <- tallies_with_earlier_violent_crime %>%
st_join(earlier_crime_sf %>% filter(general_type == "Violent Crime") %>% st_transform(2253),
join = st_contains)
#count the number of earlier recorded violent crimes within 200 meters, noting that every parcel has at least one
tallies_with_earlier_violent_crime <- tallies_with_earlier_violent_crime %>%
as.data.frame() %>% select(-geometry) %>%
group_by(parcelnum) %>% summarize(earlier_violent_crime_incidents = n())
#enlarge the parcel geometries by the amount specified in distance, so as to use a spatial join to count events occurring near the parcel
tallies_with_earlier_nuiscance_crime <- improve_detroit_and_violations_tallies %>%
st_transform(2253) %>% st_buffer(distance)
tallies_with_earlier_nuiscance_crime <- tallies_with_earlier_nuiscance_crime %>%
st_join(earlier_crime_sf %>% filter(general_type == "Nuiscance Offences") %>% st_transform(2253),
join = st_contains)
#count the number of earlier recorded nuisance offences within 200 meters, noting that every parcel has at least one
tallies_with_earlier_nuiscance_crime <- tallies_with_earlier_nuiscance_crime %>%
as.data.frame() %>% select(-geometry) %>%
group_by(parcelnum) %>% summarize(earlier_nuiscance_offences = n())
tallies_with_crime <- tallies_with_later_crime %>%
inner_join(improve_detroit_and_violations_tallies, by = "parcelnum") %>%
inner_join(tallies_with_earlier_property_crime, by = "parcelnum") %>%
inner_join(tallies_with_earlier_violent_crime, by = "parcelnum") %>%
inner_join(tallies_with_earlier_nuiscance_crime, by = "parcelnum")
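The three crime tallies above repeat the same buffer-and-count pattern, so a small helper could express it once. This is only a sketch, assuming the buffered parcel and crime sf objects created above; like the original code, it counts at least one per parcel because the spatial join is a left join.
#count the rows of points_sf falling inside each (already buffered) parcel geometry
count_within_buffer <- function(buffered_parcels, points_sf) {
  buffered_parcels %>%
    st_join(points_sf %>% st_transform(2253), join = st_contains) %>%
    as.data.frame() %>% select(-geometry) %>%
    group_by(parcelnum) %>% summarize(n = n())
}
#example use, reproducing the earlier-property-crime tally above
#buffered <- improve_detroit_and_violations_tallies %>% st_transform(2253) %>% st_buffer(distance)
#count_within_buffer(buffered, earlier_crime_sf %>% filter(general_type == "Property Crime")) %>%
#  rename(earlier_property_crime_incidents = n)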
#rm(tallies_with_earlier_property_crime, tallies_with_later_crime, tallies_with_earlier_violent_crime)
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#add the crime tallies to the model formula
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + earlier_nuiscance_offences
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6457068
sd(as.numeric(accuracies))
## [1] 0.02623764
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 116 74
## TRUE 67 107
## truth
## pred 0 1
## FALSE 119 49
## TRUE 62 134
## truth
## pred 0 1
## FALSE 132 70
## TRUE 66 97
## truth
## pred 0 1
## FALSE 132 67
## TRUE 60 105
## truth
## pred 0 1
## FALSE 124 60
## TRUE 62 118
## truth
## pred 0 1
## FALSE 117 70
## TRUE 60 117
## truth
## pred 0 1
## FALSE 129 67
## TRUE 71 97
## truth
## pred 0 1
## FALSE 113 63
## TRUE 76 112
## truth
## pred 0 1
## FALSE 115 77
## TRUE 47 125
## truth
## pred 0 1
## FALSE 123 52
## TRUE 70 119
models[4]
## [[1]]
##
## Call: glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]],
## ])
##
## Coefficients:
## (Intercept) total_fines_by_parcel
## 0.4616433 0.0002497
## number_of_violations_by_parcel improve_issues_tallies
## 0.0598077 0.0008865
## earlier_property_crime_incidents earlier_violent_crime_incidents
## -0.0160338 0.0100555
## earlier_nuiscance_offences
## 0.0746626
##
## Degrees of Freedom: 3276 Total (i.e. Null); 3270 Residual
## Null Deviance: 4542
## Residual Deviance: 4200 AIC: 4214
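The coefficients above are on the log-odds scale; exponentiating them gives multiplicative effects on the odds of being targeted for demolition per one-unit increase in each predictor, which can be easier to read. A one-line sketch for the model just printed:
#odds ratios per one-unit increase in each predictor, for this fold's model
exp(coef(models[[4]]))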
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#use the same formula as in the logistic model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6272972
sd(as.numeric(accuracies))
## [1] 0.02379004
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 117 72
## TRUE 69 106
## truth
## pred 0 1
## FALSE 106 49
## TRUE 80 129
## truth
## pred 0 1
## FALSE 116 61
## TRUE 71 117
## truth
## pred 0 1
## FALSE 105 68
## TRUE 81 110
## truth
## pred 0 1
## FALSE 105 48
## TRUE 81 130
## truth
## pred 0 1
## FALSE 105 54
## TRUE 81 124
## truth
## pred 0 1
## FALSE 124 58
## TRUE 62 120
## truth
## pred 0 1
## FALSE 96 48
## TRUE 90 130
## truth
## pred 0 1
## FALSE 102 54
## TRUE 84 124
## truth
## pred 0 1
## FALSE 106 66
## TRUE 80 112
models[[10]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) total_fines_by_parcel< 25 2206 926 0 (0.5802357 0.4197643)
## 4) earlier_property_crime_incidents>=166.5 1143 368 0 (0.6780402 0.3219598) *
## 5) earlier_property_crime_incidents< 166.5 1063 505 1 (0.4750706 0.5249294)
## 10) earlier_violent_crime_incidents< 180.5 846 415 0 (0.5094563 0.4905437)
## 20) earlier_property_crime_incidents>=118.5 364 131 0 (0.6401099 0.3598901) *
## 21) earlier_property_crime_incidents< 118.5 482 198 1 (0.4107884 0.5892116) *
## 11) earlier_violent_crime_incidents>=180.5 217 74 1 (0.3410138 0.6589862) *
## 3) total_fines_by_parcel>=25 1071 395 1 (0.3688142 0.6311858) *
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_crime %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#Random Forest, using the same formula as in the logistic and rpart models (using 500 decision trees)
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6580581
sd(as.numeric(accuracies))
## [1] 0.02161093
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 138 77
## TRUE 48 101
## truth
## pred 0 1
## FALSE 126 63
## TRUE 60 115
## truth
## pred 0 1
## FALSE 133 67
## TRUE 54 111
## truth
## pred 0 1
## FALSE 116 72
## TRUE 70 106
## truth
## pred 0 1
## FALSE 139 77
## TRUE 47 101
## truth
## pred 0 1
## FALSE 134 69
## TRUE 52 109
## truth
## pred 0 1
## FALSE 135 61
## TRUE 51 117
## truth
## pred 0 1
## FALSE 132 66
## TRUE 54 112
## truth
## pred 0 1
## FALSE 134 79
## TRUE 52 99
## truth
## pred 0 1
## FALSE 133 73
## TRUE 53 105
Now add some constant properties of the parcels to the models, to see if they improve the model fit.
#add the parcel area, to see if it improves the fit
tallies_with_area <- tallies_with_crime %>%
left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, total_acre),
by = "parcelnum") %>% select(-geometry)
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#the model formula, now including the parcel area as well as the crime tallies
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + earlier_nuiscance_offences
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.645983
sd(as.numeric(accuracies))
## [1] 0.02557795
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 118 74
## TRUE 65 107
## truth
## pred 0 1
## FALSE 119 49
## TRUE 62 134
## truth
## pred 0 1
## FALSE 130 70
## TRUE 68 97
## truth
## pred 0 1
## FALSE 133 67
## TRUE 59 105
## truth
## pred 0 1
## FALSE 124 60
## TRUE 62 118
## truth
## pred 0 1
## FALSE 117 70
## TRUE 60 117
## truth
## pred 0 1
## FALSE 129 67
## TRUE 71 97
## truth
## pred 0 1
## FALSE 113 62
## TRUE 76 113
## truth
## pred 0 1
## FALSE 116 77
## TRUE 46 125
## truth
## pred 0 1
## FALSE 120 51
## TRUE 73 120
Try and test an rpart model.
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#use the same formula as with the glm model above
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6305916
sd(as.numeric(accuracies))
## [1] 0.02164158
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 118 70
## TRUE 68 108
## truth
## pred 0 1
## FALSE 106 47
## TRUE 80 131
## truth
## pred 0 1
## FALSE 112 54
## TRUE 75 124
## truth
## pred 0 1
## FALSE 111 71
## TRUE 75 107
## truth
## pred 0 1
## FALSE 118 62
## TRUE 68 116
## truth
## pred 0 1
## FALSE 113 54
## TRUE 73 124
## truth
## pred 0 1
## FALSE 120 61
## TRUE 66 117
## truth
## pred 0 1
## FALSE 101 52
## TRUE 85 126
## truth
## pred 0 1
## FALSE 128 78
## TRUE 58 100
## truth
## pred 0 1
## FALSE 115 77
## TRUE 71 101
models[[9]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) total_fines_by_parcel< 35 2220 940 0 (0.5765766 0.4234234)
## 4) earlier_property_crime_incidents>=121.5 1675 611 0 (0.6352239 0.3647761) *
## 5) earlier_property_crime_incidents< 121.5 545 216 1 (0.3963303 0.6036697)
## 10) total_acre>=0.1225 132 48 0 (0.6363636 0.3636364) *
## 11) total_acre< 0.1225 413 132 1 (0.3196126 0.6803874) *
## 3) total_fines_by_parcel>=35 1057 395 1 (0.3736991 0.6263009) *
Try and test a random forest model.
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_area %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
#use the same formula as with the glm and rpart models above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6712336
sd(as.numeric(accuracies))
## [1] 0.03294328
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 136 67
## TRUE 50 111
## truth
## pred 0 1
## FALSE 123 58
## TRUE 63 120
## truth
## pred 0 1
## FALSE 140 59
## TRUE 47 119
## truth
## pred 0 1
## FALSE 117 64
## TRUE 69 114
## truth
## pred 0 1
## FALSE 135 72
## TRUE 51 106
## truth
## pred 0 1
## FALSE 135 58
## TRUE 51 120
## truth
## pred 0 1
## FALSE 137 62
## TRUE 49 116
## truth
## pred 0 1
## FALSE 142 62
## TRUE 44 116
## truth
## pred 0 1
## FALSE 126 72
## TRUE 60 106
## truth
## pred 0 1
## FALSE 126 79
## TRUE 60 99
#add the city council district, to see if it improves the fit
tallies_with_district <- tallies_with_area %>%
left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, council_di),
by = "parcelnum")
tallies_with_district$council_di <- as.character(tallies_with_district$council_di)
district_recording_errors <- tallies_with_district %>%
filter(! council_di %in% c("01", "02", "03", "04", "05", "06", "07"))
district_recording_errors <- parcels_with_labels %>% select(parcelnum) %>%
inner_join(district_recording_errors, by = "parcelnum")
district_recording_errors <- district_recording_errors %>%
st_join(city_council_districts %>% select(districts), join = st_within)
## although coordinates are longitude/latitude, st_within assumes that they are planar
district_recording_errors <- district_recording_errors %>%
mutate(council_di = districts) %>% as.data.frame() %>%
select(-districts, -geometry) %>% as.tibble()
tallies_with_district <- tallies_with_district %>%
filter(council_di %in% c("01", "02", "03", "04", "05", "06", "07")) %>%
rbind(district_recording_errors)
#with the observation that, for example, district 7 is currently represented by both of the strings "07" and "7", we make the representation consistent
tallies_with_district <- tallies_with_district %>%
mutate(council_di = as.integer(council_di)) %>%
mutate(council_di = as.factor(council_di))
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_district %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + earlier_nuiscance_offences
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6369028
sd(as.numeric(accuracies))
## [1] 0.02471345
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 103 44
## TRUE 83 134
## truth
## pred 0 1
## FALSE 105 42
## TRUE 81 136
## truth
## pred 0 1
## FALSE 126 58
## TRUE 61 120
## truth
## pred 0 1
## FALSE 107 59
## TRUE 79 119
## truth
## pred 0 1
## FALSE 126 74
## TRUE 60 104
## truth
## pred 0 1
## FALSE 112 76
## TRUE 74 102
## truth
## pred 0 1
## FALSE 110 56
## TRUE 76 122
## truth
## pred 0 1
## FALSE 130 70
## TRUE 56 108
## truth
## pred 0 1
## FALSE 109 58
## TRUE 77 120
## truth
## pred 0 1
## FALSE 110 62
## TRUE 76 116
models[[1]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) total_fines_by_parcel< 35 2228 942 0 (0.5771993 0.4228007)
## 4) total_acre>=0.1065 1067 338 0 (0.6832240 0.3167760) *
## 5) total_acre< 0.1065 1161 557 1 (0.4797588 0.5202412)
## 10) earlier_property_crime_incidents>=164.5 576 232 0 (0.5972222 0.4027778)
## 20) earlier_violent_crime_incidents< 264.5 350 108 0 (0.6914286 0.3085714) *
## 21) earlier_violent_crime_incidents>=264.5 226 102 1 (0.4513274 0.5486726) *
## 11) earlier_property_crime_incidents< 164.5 585 213 1 (0.3641026 0.6358974) *
## 3) total_fines_by_parcel>=35 1049 389 1 (0.3708294 0.6291706) *
Now try random forest, with the dataset and formula above.
#same formula as in the rpart model above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.686616
sd(as.numeric(accuracies))
## [1] 0.02196345
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 132 53
## TRUE 54 125
## truth
## pred 0 1
## FALSE 132 68
## TRUE 54 110
## truth
## pred 0 1
## FALSE 147 63
## TRUE 40 115
## truth
## pred 0 1
## FALSE 129 56
## TRUE 57 122
## truth
## pred 0 1
## FALSE 135 63
## TRUE 51 115
## truth
## pred 0 1
## FALSE 133 67
## TRUE 53 111
## truth
## pred 0 1
## FALSE 129 61
## TRUE 57 117
## truth
## pred 0 1
## FALSE 140 55
## TRUE 46 123
## truth
## pred 0 1
## FALSE 123 58
## TRUE 63 120
## truth
## pred 0 1
## FALSE 131 67
## TRUE 55 111
Now add the frontage (from parcel data). I wouldn’t expect this to add very much to the value of the model (given that we have already included area), but let’s see.
tallies_with_frontage <- tallies_with_district %>%
left_join(parcel_sf %>% as.data.frame() %>% select(parcelnum, frontage), by = "parcelnum")
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_frontage %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + frontage + earlier_nuiscance_offences
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6498043
sd(as.numeric(accuracies))
## [1] 0.03416033
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 103 49
## TRUE 83 129
## truth
## pred 0 1
## FALSE 123 53
## TRUE 63 125
## truth
## pred 0 1
## FALSE 132 50
## TRUE 55 128
## truth
## pred 0 1
## FALSE 105 61
## TRUE 81 117
## truth
## pred 0 1
## FALSE 116 54
## TRUE 70 124
## truth
## pred 0 1
## FALSE 118 65
## TRUE 68 113
## truth
## pred 0 1
## FALSE 119 62
## TRUE 67 116
## truth
## pred 0 1
## FALSE 124 55
## TRUE 62 123
## truth
## pred 0 1
## FALSE 106 51
## TRUE 80 127
## truth
## pred 0 1
## FALSE 104 64
## TRUE 82 114
models[[2]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) total_fines_by_parcel< 35 2220 942 0 (0.5756757 0.4243243)
## 4) frontage>=35.5 1296 423 0 (0.6736111 0.3263889) *
## 5) frontage< 35.5 924 405 1 (0.4383117 0.5616883)
## 10) earlier_property_crime_incidents>=133.5 550 256 0 (0.5345455 0.4654545)
## 20) council_di=3,6,7 262 96 0 (0.6335878 0.3664122) *
## 21) council_di=1,2,4,5 288 128 1 (0.4444444 0.5555556)
## 42) earlier_property_crime_incidents>=179.5 168 76 0 (0.5476190 0.4523810)
## 84) earlier_violent_crime_incidents< 260.5 92 26 0 (0.7173913 0.2826087) *
## 85) earlier_violent_crime_incidents>=260.5 76 26 1 (0.3421053 0.6578947) *
## 43) earlier_property_crime_incidents< 179.5 120 36 1 (0.3000000 0.7000000) *
## 11) earlier_property_crime_incidents< 133.5 374 111 1 (0.2967914 0.7032086) *
## 3) total_fines_by_parcel>=35 1057 397 1 (0.3755913 0.6244087) *
Random forest again:
#same formula as in the rpart model above
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6855163
sd(as.numeric(accuracies))
## [1] 0.02510488
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 124 51
## TRUE 62 127
## truth
## pred 0 1
## FALSE 122 66
## TRUE 64 112
## truth
## pred 0 1
## FALSE 138 53
## TRUE 49 125
## truth
## pred 0 1
## FALSE 121 53
## TRUE 65 125
## truth
## pred 0 1
## FALSE 131 55
## TRUE 55 123
## truth
## pred 0 1
## FALSE 132 58
## TRUE 54 120
## truth
## pred 0 1
## FALSE 127 53
## TRUE 59 125
## truth
## pred 0 1
## FALSE 131 47
## TRUE 55 131
## truth
## pred 0 1
## FALSE 114 51
## TRUE 72 127
## truth
## pred 0 1
## FALSE 123 60
## TRUE 63 118
We now look at the effect of nearby parcels that have no buildings on them. Many of these will be lots on which buildings have been dismantled. (However, could some of them be public parks or the like?)
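One way to probe that question is to look at what the parcel file says about the no-building parcels. The count below uses the num_buildi field of parcel_sf already used in this chunk; the commented line assumes a land-use or zoning column whose actual name would need to be checked against the download, so it is only a sketch.
#how many parcels in the parcel file have no buildings recorded on them?
parcel_sf %>% filter(num_buildi == 0) %>% nrow()
#if the parcel file carries a land-use or zoning column (the name below is a guess), tabulating it
#for the no-building parcels would show how many are parks, vacant lots, and so on
#parcel_sf %>% as.data.frame() %>% filter(num_buildi == 0) %>% count(zoning, sort = TRUE)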
distance <- set_units(200, "m") #for vacant parcels within 200 meters
vacant_parcel_buffers <- parcel_sf %>% select(parcelnum) %>%
inner_join(tallies_with_frontage, by = "parcelnum") %>%
st_transform(2253) %>% st_buffer(distance)
vacant_parcel_join <- vacant_parcel_buffers %>%
st_join(parcel_sf %>% st_transform(2253) %>% filter(num_buildi == 0) %>% select(parcelnum),
join = st_contains) %>%
filter(parcelnum.x != parcelnum.y) %>%
group_by(parcelnum.x) %>% summarize(num_vacant_parcels = n())
tallies_with_vacant_parcels <- tallies_with_frontage %>%
left_join(vacant_parcel_join %>% as.data.frame() %>% select(-geometry),
by = c("parcelnum" = "parcelnum.x"))
tallies_with_vacant_parcels <- tallies_with_vacant_parcels %>%
mutate(num_vacant_parcels = ifelse(is.na(num_vacant_parcels), 0, num_vacant_parcels))
Model with rpart:
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_vacant_parcels %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
earlier_nuiscance_offences
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6772859
sd(as.numeric(accuracies))
## [1] 0.02237345
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 122 55
## TRUE 64 123
## truth
## pred 0 1
## FALSE 120 61
## TRUE 66 117
## truth
## pred 0 1
## FALSE 119 49
## TRUE 68 129
## truth
## pred 0 1
## FALSE 96 39
## TRUE 90 139
## truth
## pred 0 1
## FALSE 123 37
## TRUE 63 141
## truth
## pred 0 1
## FALSE 112 47
## TRUE 74 131
## truth
## pred 0 1
## FALSE 119 52
## TRUE 67 126
## truth
## pred 0 1
## FALSE 110 42
## TRUE 76 136
## truth
## pred 0 1
## FALSE 114 40
## TRUE 72 138
## truth
## pred 0 1
## FALSE 124 51
## TRUE 62 127
models[[7]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.51113824 0.48886176)
## 2) num_vacant_parcels< 41.5 1672 525 0 (0.68600478 0.31399522)
## 4) total_fines_by_parcel< 35 1139 261 0 (0.77085162 0.22914838) *
## 5) total_fines_by_parcel>=35 533 264 0 (0.50469043 0.49530957)
## 10) frontage>=79 22 2 0 (0.90909091 0.09090909) *
## 11) frontage< 79 511 249 1 (0.48727984 0.51272016)
## 22) total_fines_by_parcel< 512.5 262 113 0 (0.56870229 0.43129771) *
## 23) total_fines_by_parcel>=512.5 249 100 1 (0.40160643 0.59839357) *
## 3) num_vacant_parcels>=41.5 1605 528 1 (0.32897196 0.67102804) *
Now try random forest:
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7061185
sd(as.numeric(accuracies))
## [1] 0.0239571
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 119 48
## TRUE 67 130
## truth
## pred 0 1
## FALSE 127 63
## TRUE 59 115
## truth
## pred 0 1
## FALSE 137 49
## TRUE 50 129
## truth
## pred 0 1
## FALSE 124 46
## TRUE 62 132
## truth
## pred 0 1
## FALSE 135 45
## TRUE 51 133
## truth
## pred 0 1
## FALSE 130 53
## TRUE 56 125
## truth
## pred 0 1
## FALSE 131 56
## TRUE 55 122
## truth
## pred 0 1
## FALSE 135 46
## TRUE 51 132
## truth
## pred 0 1
## FALSE 128 42
## TRUE 58 136
## truth
## pred 0 1
## FALSE 132 59
## TRUE 54 119
With the idea that blighted buildings may tend to come in clusters, we may guess that blight in one building indicates that nearby buildings are more likely to become blighted. We thus, for each parcel we are using for training and testing, count the number of buildings that became blighted before June of 2016, i.e. before the period over which we are attempting to carry out our predictions.
library(lubridate)
#identify the earlier completed demolitions and put the results into a form that can be combined with the dismantle permits results
earlier_completed_demolitions_sf <- completed_demolitions_sf %>%
mutate(Date = `Demolition Date`, parcelnum = `Parcel ID`) %>%
select(Date, parcelnum) %>% filter(Date < mdy("06/01/2016"))
#repeat for the dismantle permit results
earlier_dismantle_permits_sf <- dismantle_permits_sf %>%
mutate(Date = `Permit Issued`, parcelnum = `Parcel Number`) %>%
select(Date, parcelnum) %>% filter(Date < mdy("06/01/2016"))
#combine the above results, and ensure that parcels being counted are not the parcels that we are using for our modeling dataset
earlier_blighted_parcels <- rbind(earlier_completed_demolitions_sf, earlier_dismantle_permits_sf) %>%
as.data.frame() %>% select(-geometry) %>% distinct(parcelnum) %>%
filter(! parcelnum %in% tallies_with_vacant_parcels$parcelnum)
#add the parcel geometry to the set of earlier blighted parcels
earlier_blighted_parcels <- parcel_sf %>% select(parcelnum) %>%
inner_join(earlier_blighted_parcels, by = "parcelnum")
#set a distance of 100 meters
distance <- set_units(100, "m") #for setting a buffer of 100 meters around each relevant parcel
#expand the geometry for each parcel by 100 meters
buffered_parcels <- parcel_sf %>% select(parcelnum) %>%
inner_join(tallies_with_vacant_parcels, by = "parcelnum") %>%
st_transform(2253) %>% st_buffer(distance)
#count the numbers of blighted parcels within the expanded geometry (excluding the parcel for which we are counting)
tallies_of_earlier_blighted_parcels <- buffered_parcels %>% select(parcelnum) %>%
st_join(earlier_blighted_parcels %>% st_transform(2253),
join = st_contains, left = FALSE) %>% filter(parcelnum.x != parcelnum.y)
tallies_of_earlier_blighted_parcels <- tallies_of_earlier_blighted_parcels %>%
as.data.frame() %>% select(-geometry) %>%
group_by(parcelnum.x) %>% summarize(num_nearby_blighted_parcels = n())
#our modeling dataset, now with the new quantity
tallies_with_earlier_blighted_parcels <- tallies_with_vacant_parcels %>%
left_join(tallies_of_earlier_blighted_parcels, by = c("parcelnum" = "parcelnum.x")) %>%
mutate(num_nearby_blighted_parcels = ifelse(is.na(num_nearby_blighted_parcels), 0,
num_nearby_blighted_parcels))
rm(earlier_completed_demolitions_sf, earlier_dismantle_permits_sf, buffered_parcels, earlier_blighted_parcels,
tallies_of_earlier_blighted_parcels)
We now try an rpart model, this time including the earlier demolition data.
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_earlier_blighted_parcels %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(294)
k_folds <- caret::createFolds(train$blighted)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
num_nearby_blighted_parcels + earlier_nuiscance_offences
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6970548
sd(as.numeric(accuracies))
## [1] 0.02505895
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 110 38
## TRUE 76 140
## truth
## pred 0 1
## FALSE 107 42
## TRUE 79 136
## truth
## pred 0 1
## FALSE 119 34
## TRUE 68 144
## truth
## pred 0 1
## FALSE 130 49
## TRUE 56 129
## truth
## pred 0 1
## FALSE 126 31
## TRUE 60 147
## truth
## pred 0 1
## FALSE 129 55
## TRUE 57 123
## truth
## pred 0 1
## FALSE 103 34
## TRUE 83 144
## truth
## pred 0 1
## FALSE 102 36
## TRUE 84 142
## truth
## pred 0 1
## FALSE 107 30
## TRUE 79 148
## truth
## pred 0 1
## FALSE 107 33
## TRUE 79 145
models[[7]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) num_nearby_blighted_parcels< 1.5 1561 456 0 (0.7078796 0.2921204)
## 4) num_vacant_parcels< 50.5 1310 307 0 (0.7656489 0.2343511) *
## 5) num_vacant_parcels>=50.5 251 102 1 (0.4063745 0.5936255) *
## 3) num_nearby_blighted_parcels>=1.5 1716 570 1 (0.3321678 0.6678322) *
Now try random forest:
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7203952
sd(as.numeric(accuracies))
## [1] 0.02796726
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 125 45
## TRUE 61 133
## truth
## pred 0 1
## FALSE 132 60
## TRUE 54 118
## truth
## pred 0 1
## FALSE 142 42
## TRUE 45 136
## truth
## pred 0 1
## FALSE 126 38
## TRUE 60 140
## truth
## pred 0 1
## FALSE 140 47
## TRUE 46 131
## truth
## pred 0 1
## FALSE 136 45
## TRUE 50 133
## truth
## pred 0 1
## FALSE 128 56
## TRUE 58 122
## truth
## pred 0 1
## FALSE 120 45
## TRUE 66 133
## truth
## pred 0 1
## FALSE 133 38
## TRUE 53 140
## truth
## pred 0 1
## FALSE 126 49
## TRUE 60 129
Now we add, for each parcel, the number of blight violations and the total fines recorded at nearby parcels.
parcels <- parcel_sf %>% select(parcelnum) %>%
inner_join(tallies_with_earlier_blighted_parcels %>% select(parcelnum),
by = "parcelnum")
distance <- set_units(100, "m")
buffered_parcels <- parcels %>% st_transform(2253) %>% st_buffer(distance)
blight_violation_points_in_buffer <- blight_violations_sf %>%
st_transform(2253) %>% st_join(buffered_parcels, join = st_within) %>%
filter(`Violation Parcel ID` != parcelnum)
violations_at_neighboring_parcels <- blight_violation_points_in_buffer %>%
group_by(parcelnum) %>%
summarize(num_violations_nearby_parcels = n(),
sum_fines_nearby_parcels = sum(as.numeric(str_sub(`Fine Amount`, start = 2)), na.rm = TRUE))
tallies_with_nearby_violations <- tallies_with_earlier_blighted_parcels %>%
left_join(violations_at_neighboring_parcels %>% as.data.frame() %>% select(-geometry),
by = "parcelnum")
tallies_with_nearby_violations <- tallies_with_nearby_violations %>%
mutate(sum_fines_nearby_parcels =
ifelse(is.na(sum_fines_nearby_parcels), 0, sum_fines_nearby_parcels),
num_violations_nearby_parcels =
ifelse(is.na(num_violations_nearby_parcels), 0, num_violations_nearby_parcels))
We use this dataset to construct an rpart model:
#separate the examples to be used for the training from the twenty percent of the examples to be withheld for final testing (remembering that we will do 10-fold cross validation over the training set)
train <- tallies_with_nearby_violations %>% filter(parcelnum %in% training_parcelnums)
#test <- blight_violation_tallies %>% filter(parcelnum %in% testing_parcelnums)
train$blighted <- as.factor(train$blighted)
#partition the training set into ten subsets, while maintaining a balance between examples labeled as blighted and examples not so labeled
set.seed(515)
k_folds <- caret::createFolds(train$blighted)
formula <- blighted ~ total_fines_by_parcel + number_of_violations_by_parcel +
improve_issues_tallies + earlier_property_crime_incidents +
earlier_violent_crime_incidents + total_acre + council_di + frontage + num_vacant_parcels +
num_nearby_blighted_parcels + num_violations_nearby_parcels + earlier_nuiscance_offences
models <- 1:10 %>% map(~ rpart(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.6915663
sd(as.numeric(accuracies))
## [1] 0.02177809
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 107 30
## TRUE 80 148
## truth
## pred 0 1
## FALSE 108 34
## TRUE 78 144
## truth
## pred 0 1
## FALSE 101 38
## TRUE 85 140
## truth
## pred 0 1
## FALSE 127 55
## TRUE 59 123
## truth
## pred 0 1
## FALSE 112 40
## TRUE 74 138
## truth
## pred 0 1
## FALSE 122 53
## TRUE 64 125
## truth
## pred 0 1
## FALSE 123 34
## TRUE 63 144
## truth
## pred 0 1
## FALSE 110 27
## TRUE 76 151
## truth
## pred 0 1
## FALSE 131 67
## TRUE 55 111
## truth
## pred 0 1
## FALSE 110 35
## TRUE 76 143
models[[3]]
## n= 3277
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3277 1602 0 (0.5111382 0.4888618)
## 2) num_nearby_blighted_parcels< 1.5 1557 456 0 (0.7071291 0.2928709)
## 4) num_vacant_parcels< 51.5 1316 307 0 (0.7667173 0.2332827) *
## 5) num_vacant_parcels>=51.5 241 92 1 (0.3817427 0.6182573) *
## 3) num_nearby_blighted_parcels>=1.5 1720 574 1 (0.3337209 0.6662791) *
models <- 1:10 %>% map(~ glm(formula, data = train[-k_folds[[.x]],], family = "binomial"))
predictions <- 1:10 %>% map(~ predict.glm(models[[.x]], newdata = train[k_folds[[.x]],],
type = "response"))
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.7058453
sd(as.numeric(accuracies))
## [1] 0.01969066
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 142 56
## TRUE 45 122
## truth
## pred 0 1
## FALSE 140 65
## TRUE 46 113
## truth
## pred 0 1
## FALSE 131 58
## TRUE 55 120
## truth
## pred 0 1
## FALSE 138 60
## TRUE 48 118
## truth
## pred 0 1
## FALSE 131 56
## TRUE 55 122
## truth
## pred 0 1
## FALSE 142 61
## TRUE 44 117
## truth
## pred 0 1
## FALSE 152 64
## TRUE 34 114
## truth
## pred 0 1
## FALSE 145 54
## TRUE 41 124
## truth
## pred 0 1
## FALSE 135 66
## TRUE 51 112
## truth
## pred 0 1
## FALSE 140 66
## TRUE 46 112
models[[9]]
##
## Call: glm(formula = formula, family = "binomial", data = train[-k_folds[[.x]],
## ])
##
## Coefficients:
## (Intercept) total_fines_by_parcel
## -0.862438 0.000235
## number_of_violations_by_parcel improve_issues_tallies
## 0.110902 0.001646
## earlier_property_crime_incidents earlier_violent_crime_incidents
## -0.007130 0.005644
## total_acre council_di2
## 0.088485 -0.235041
## council_di3 council_di4
## -0.133384 0.020426
## council_di5 council_di6
## -0.572968 -0.343677
## council_di7 frontage
## -0.193995 -0.005878
## num_vacant_parcels num_nearby_blighted_parcels
## 0.014407 0.174176
## num_violations_nearby_parcels earlier_nuiscance_offences
## -0.004899 0.026017
##
## Degrees of Freedom: 3276 Total (i.e. Null); 3259 Residual
## Null Deviance: 4541
## Residual Deviance: 3682 AIC: 3718
Now try random forest:
models <- 1:10 %>% map(~ randomForest(formula, data = train[-k_folds[[.x]],]))
predictions <- 1:10 %>% map(~ predict(models[[.x]], newdata = train[k_folds[[.x]],], type = "class"))
predictions <- 1:10 %>% map(~as.numeric(predictions[[.x]]) - 1)
accuracies <- 1:10 %>%
map(~ mean(as.numeric(! predictions[[.x]] < 0.5) == train[k_folds[[.x]],]$blighted))
#summary statistics over the models
mean(as.numeric(accuracies))
## [1] 0.73112
sd(as.numeric(accuracies))
## [1] 0.01820602
#confusion matrices for the models
for (index in 1:10) {
print(table(pred = (predictions[[index]] > 0.5), truth = train[k_folds[[index]],]$blighted))
}
## truth
## pred 0 1
## FALSE 129 43
## TRUE 58 135
## truth
## pred 0 1
## FALSE 130 46
## TRUE 56 132
## truth
## pred 0 1
## FALSE 128 46
## TRUE 58 132
## truth
## pred 0 1
## FALSE 135 41
## TRUE 51 137
## truth
## pred 0 1
## FALSE 127 34
## TRUE 59 144
## truth
## pred 0 1
## FALSE 131 48
## TRUE 55 130
## truth
## pred 0 1
## FALSE 134 44
## TRUE 52 134
## truth
## pred 0 1
## FALSE 139 37
## TRUE 47 141
## truth
## pred 0 1
## FALSE 133 52
## TRUE 53 126
## truth
## pred 0 1
## FALSE 133 46
## TRUE 53 132
models[[3]]
##
## Call:
## randomForest(formula = formula, data = train[-k_folds[[.x]], ])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.1%
## Confusion matrix:
## 0 1 class.error
## 0 1181 494 0.2949254
## 1 394 1208 0.2459426
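Before saving, it is worth a glance at which predictors the forest actually leans on; randomForest stores a Gini-based importance measure in each fitted model. A short sketch for the fold model printed above:
#mean decrease in Gini impurity for each predictor, for this fold's forest
randomForest::importance(models[[3]])
#the same information as a dot chart
#varImpPlot(models[[3]])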
Save the dataset we are using for modeling to disk, for use in other scripts.
#exploratory tally of vagrancy incidents recorded at the property itself; it is not used further, so it is discarded
tallies_with_vagrancy_at_property <- tallies_with_nearby_violations %>%
inner_join(parcel_sf %>% select(parcelnum, geometry), by = "parcelnum") %>%
st_sf() %>% st_join(earlier_crime_sf %>% filter(CATEGORY == "VAGRANCY (OTHER)"),
join = st_contains, left = FALSE) %>%
group_by(parcelnum) %>% summarize(vagrancy_incidents_at_property = n())
rm(tallies_with_vagrancy_at_property)
write_rds(tallies_with_nearby_violations, "calculated_tallies.rds")
write_rds(training_parcelnums, "training_parcelnums.rds")