library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(modelr)
options(na.action = na.warn)
library(nycflights13)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
diamonds2 <- diamonds %>%
mutate(lcarat = log2(carat),
lprice = log2(price))
ggplot(diamonds2) + geom_point(aes(x = lcarat, y = lprice), alpha = 0.03)
The vertical strips represent round numbers, together with the fact that carat is measured to only two decimal places.
Applying the exponential function to both sides gives
price
= exp(a_0) * exp(log(carat) * a_1)
and thus
price
= exp(a_0) * carat ^ a_1
If carat is multiplied by a number r
, then the new price is
exp(a_0) * (carat * r) ^ a_1
= exp(a_0) * (carat ^ a_1) * (r ^ a_1)
= price * (r ^ a_1)
(price
here being the old price)
Therefore, if carat
is multiplied by r
, then the price is multiplied by (r ^ a_1)
Implement the model and add and plot the residuals, as per discussion in the text:
mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity, data = diamonds2)
diamonds2 <- diamonds2 %>%
add_residuals(mod_diamond2, "lresid2")
ggplot(diamonds2, aes(lcarat, lresid2)) +
geom_hex(bins = 50)
Extract the the elements for which abs(lresid2)
> 0.70, and plot with carat rather than lcarat:
diamonds_high_residuals <- diamonds2 %>%
filter(abs(lresid2) > 0.75)
ggplot(diamonds_high_residuals, aes(carat, lresid2)) +
geom_hex(bins = 50) + labs(y = "residual for log2(price)")
We see an odd As suggested in the above two plots, for the larger (absolute value) residuals for lprice, the positive differences have strong tendency to correspond to lesser values of
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
diamonds_high_residuals <- diamonds_high_residuals %>%
mutate(`log residual price:` = ifelse(lresid2 > 0.75,
"above 0.75",
"below 0.75"))
plt_high_residuals <- ggplot(diamonds_high_residuals) +
geom_bar(aes(x = cut, fill = `log residual price:`),
stat = "count") +
theme(legend.position = "bottom") +
labs(title = "counts for high residuals")
plt_all <- ggplot(diamonds2) + geom_bar(aes(x = cut), stat = "count") +
labs(title = "counts for all")
stacked_plot <- function(gg_1, gg_2) {
p1 <- ggplot_gtable(ggplot_build(gg_1))
p2 <- ggplot_gtable(ggplot_build(gg_2))
#to line-up the the x-axes, find the maximum of the two widths
maxWidth <- unit.pmax(p1$widths, p2$widths)
#set the widths of both plots to this maximum
p1$widths <- maxWidth
p2$widths <- maxWidth
#output the stacked plots
return (grid.arrange(p1, p2))
}
stacked_plot(plt_all, plt_high_residuals)
plt_residuals <- ggplot(diamonds_high_residuals) +
geom_bar(aes(x = color, fill = `log residual price:`),
stat = "count") +
theme(legend.position = "bottom") +
labs(title = "counts for high residuals")
plt_all <- ggplot(diamonds2) + geom_bar(aes(x = color), stat = "count") +
labs(title = "counts for all")
stacked_plot(plt_all, plt_residuals)
plt_residuals <- ggplot(diamonds_high_residuals) +
geom_bar(aes(x = clarity, fill = `log residual price:`),
stat = "count") +
theme(legend.position = "bottom") +
labs(title = "counts for high residuals")
plt_all <- ggplot(diamonds2) + geom_bar(aes(x = clarity), stat = "count") +
labs(title = "counts for all")
stacked_plot(plt_all, plt_residuals)
The examples with extreme residuals suggest that the linear model with lprice ~ lcarat + color + cut + clarity
may not have adequately effects of color and clarity.
diamonds2 <- diamonds2 %>%
add_predictions(mod_diamond2, "predicted_log_price") %>%
mutate(price_prediction = 2 ^ predicted_log_price)
diamonds2 %>% mutate(fractional_abs_residual = abs(price_prediction - price)/price) %>%
ggplot(aes(x = log10(price), y = fractional_abs_residual)) +
geom_boxplot(aes(group = cut_number(price, 20))) +
labs(title = "Absolute Values of the Residuals, as Fractions of the Prices") +
theme(plot.title = element_text(hjust = 0.5))
As indicated by the box plot, our model provides a useful indicator of the appropriate price, with the great majority strikingly close
Each of these days is the Sunday for a Monday public holiday.
daily %>%
top_n(3, resid)
#> # A tibble: 3 × 5
#> date n wday resid term
#> <date> <int> <ord> <dbl> <fctr>
#> 1 2013-11-30 857 Sat 112.4 fall
#> 2 2013-12-01 987 Sun 95.5 fall
#> 3 2013-12-28 814 Sat 69.4 fall
They represent heavy travel days around the Thanksgiving and Christmas holidays
I will compare the model under consideration here with the n ~ wday * term model, using facets.
daily <- flights %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarise(n = n())
daily <- daily %>%
mutate(wday = as.character(wday(date, label = TRUE)))
term <- function(date) {
cut(date,
breaks = ymd(20130101, 20130605, 20130825, 20140101),
labels = c("spring", "summer", "fall")
)
}
daily <- daily %>%
mutate(term = term(date))
daily <- daily %>%
mutate(sat_season_split = ifelse(wday == "Sat",
ifelse(term == "spring",
"Sat_spring",
ifelse(term == "summer",
"Sat_summer",
"Sat_fall")),
wday))
daily <- daily %>%
mutate(sat_season_split = as.factor(sat_season_split))
mod_sat_split <- lm(n ~ sat_season_split, data = daily)
mod_wday_term <- lm(n ~ wday*term, data = daily)
daily <- daily %>% gather_residuals(`Model with Saturday Split by School Term` = mod_sat_split,
`Model using the Product of Weekay and School Term` = mod_wday_term)
ggplot(daily) + geom_line(aes(x = date, y = resid)) + facet_wrap(~ model)
ggplot(daily) + geom_line(aes(x = date, y = resid, color = wday)) + facet_wrap(~ model)
The plots reveal little overall difference in closeness of fit (but with the differences over certain periods of the year). Likewise for the quantitative measures of goodness of fit, as per the following.
summary(mod_sat_split)
##
## Call:
## lm(formula = n ~ sat_season_split, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -331.75 -8.81 11.31 23.64 141.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 967.462 6.568 147.308 < 2e-16 ***
## sat_season_splitMon 7.346 9.288 0.791 0.4295
## sat_season_splitSat_fall -251.462 12.951 -19.416 < 2e-16 ***
## sat_season_splitSat_spring -230.143 12.045 -19.107 < 2e-16 ***
## sat_season_splitSat_summer -166.545 15.167 -10.981 < 2e-16 ***
## sat_season_splitSun -75.981 9.288 -8.181 5.06e-15 ***
## sat_season_splitThu -1.712 9.288 -0.184 0.8539
## sat_season_splitTue -16.103 9.244 -1.742 0.0824 .
## sat_season_splitWed -4.769 9.288 -0.513 0.6079
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.36 on 356 degrees of freedom
## Multiple R-squared: 0.7357, Adjusted R-squared: 0.7297
## F-statistic: 123.8 on 8 and 356 DF, p-value: < 2.2e-16
summary(mod_wday_term)
##
## Call:
## lm(formula = n ~ wday * term, data = daily)
##
## Residuals:
## Min 1Q Median 3Q Max
## -326.33 -8.00 9.82 20.67 141.00
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 962.0000 9.8426 97.739 < 2e-16 ***
## wdayMon -0.7273 13.9195 -0.052 0.9584
## wdaySat -224.6818 13.9195 -16.142 < 2e-16 ***
## wdaySun -89.7727 13.9195 -6.449 3.81e-10 ***
## wdayThu 3.3636 13.9195 0.242 0.8092
## wdayTue -21.7391 13.7673 -1.579 0.1152
## wdayWed -10.3636 13.9195 -0.745 0.4571
## termsummer 17.8333 16.5675 1.076 0.2825
## termfall 3.8889 14.6724 0.265 0.7911
## wdayMon:termsummer 15.0758 23.7720 0.634 0.5264
## wdaySat:termsummer 45.7652 23.4300 1.953 0.0516 .
## wdaySun:termsummer 33.8485 23.7720 1.424 0.1554
## wdayThu:termsummer -8.6136 23.4300 -0.368 0.7134
## wdayTue:termsummer 29.9058 23.6833 1.263 0.2075
## wdayWed:termsummer 23.0303 23.4300 0.983 0.3263
## wdayMon:termfall 14.1015 20.5992 0.685 0.4941
## wdaySat:termfall -25.2071 20.7499 -1.215 0.2253
## wdaySun:termfall 18.8838 20.5992 0.917 0.3599
## wdayThu:termfall -8.9192 20.7499 -0.430 0.6676
## wdayTue:termfall -0.5708 20.4967 -0.028 0.9778
## wdayWed:termfall 0.8081 20.7499 0.039 0.9690
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.17 on 344 degrees of freedom
## Multiple R-squared: 0.7573, Adjusted R-squared: 0.7432
## F-statistic: 53.67 on 20 and 344 DF, p-value: < 2.2e-16
library(timeDate)
date(holidayNYSE(2013))
## [1] "2013-01-01" "2013-01-21" "2013-02-18" "2013-03-29" "2013-05-27"
## [6] "2013-07-04" "2013-09-02" "2013-11-28" "2013-12-25"
daily <- daily %>% select(-model)
daily <- daily %>% select(-resid)
daily <- daily %>% mutate(wday_term_holiday = ifelse(date %in% date(holidayNYSE(2013)),
"holiday",
as.character(sat_season_split)))
mod_sat_split
##
## Call:
## lm(formula = n ~ sat_season_split, data = daily)
##
## Coefficients:
## (Intercept) sat_season_splitMon
## 967.462 7.346
## sat_season_splitSat_fall sat_season_splitSat_spring
## -251.462 -230.143
## sat_season_splitSat_summer sat_season_splitSun
## -166.545 -75.981
## sat_season_splitThu sat_season_splitTue
## -1.712 -16.103
## sat_season_splitWed
## -4.769
mod_sat_split_holidays <- lm(n ~ wday_term_holiday, data = daily)
daily <- daily %>%
gather_residuals(`Holidays Considered` = mod_sat_split_holidays,
`Holidays not Considered` = mod_sat_split)
ggplot(daily) + geom_line(aes(date, resid)) + facet_wrap(~ model)
daily
As expected, the “Holidays Considered” model (with the variable that took into account whether a particular holiday is a public holiday) did not present an improvement. After all, for many of the holidays, the sharp effect is on a day before or after the holiday.
Likely overfitting of the model, which will contain many predicting combinations (e.g. Wednesday in June), each with very few examples. Testing this hypothesis:
daily <- daily %>% select(-resid, -model) %>%
mutate(month = month(date, label = TRUE))
mod_wday_month <- lm(n ~ wday * month, data = daily)
daily <- daily %>% gather_residuals(`Weeks Combined with Months` = mod_wday_month,
`Day of Week with Saturday split by School Term` = mod_sat_split)
ggplot(daily) + geom_line(aes(date, resid)) + facet_wrap(~ model)
ggplot(daily) + geom_line(aes(x = date, y = resid, color = wday)) + facet_wrap(~ model)
The fit of the model with the weekdays considered together with months is closer. However, the model would likely not be useful in making predictions for, say, flights for years other than 2013.
I might expect the model to fit a smooth trend for each of the weekdays (e.g. the plot for Saturdays would be smooth) but, like all of our models, sharply cyclical over the the weeks. The splines will roughly reflect the school terms, but will make for a smoother plot.
library(splines)
daily <- daily %>% select(-model, -resid)
mod_wday_plus_spline <- lm(n ~ wday + ns(date, 5), data = daily)
daily <- daily %>%
gather_predictions(`Weekdays Combined with Splines` = mod_wday_plus_spline,
`Day of Week with Saturday split by School Term` = mod_sat_split) %>%
mutate(resid = pred - n)
ggplot(daily) + geom_line(aes(x= date, y = pred, color = wday)) + facet_wrap(~ model)
The residuals, by day of the week:
ggplot(daily) + geom_line(aes(x = date, y = resid, color = wday)) + facet_wrap(~ model)
I tried various models and processed the residuals in various ways. The following plot, constructed with box plots and red dots indicating the mean residual distance for each hour, shows properties of the distribution of the the residuals for a model constructed with splines.
#combine the scheduled hour and minute values into a single number that accuratly represents the time, and then add the days of the week
flights_plus <- flights %>%
mutate(time = hour + minute * ( 1/ 60)) %>%
mutate(wday = wday(time_hour, label = TRUE))
model <- lm(distance ~ ns(time, 5), data = flights_plus)
flights_plus <- flights_plus %>%
gather_predictions(model) %>%
mutate(residual = pred - distance)
flights_plus %>% mutate(hour = as.factor(hour)) %>%
group_by(wday, hour) %>%
summarize(residual = mean(residual)) %>% #mean residual called "residual" for purposes of mapping
ggplot() + geom_point(aes(x = hour, y = residual),
color = "red", size = 2) +
geom_boxplot(data = (flights_plus %>% mutate(hour = as.factor(hour))),
aes(x = hour, y = residual),
inherit.aes = FALSE) + labs(y = "residual of the distance") +
facet_wrap(~ wday, ncol = 2)
In the above, Saturday shows noticeable differences from the other days in the distribution of the distances. However, for Sunday, other than its thus standing out in relevant respects from Saturday, I found no evidence that Sunday evening flights are to places further away than the evening flights during the rest of the week.
We can use the fct_shift
function.
#wrap the application of fct_shift() in a user-defined function,
weeks_reordered <- function(df) {
return (df %>% mutate(wday = fct_shift(wday, n = 1L)))
}
#apply the function to the data frame constructed in the answer to #7
flights_plus <- weeks_reordered(flights_plus)
#test the level order
levels(flights_plus$wday)
## [1] "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"