library(nycflights13)
flights %>% mutate(row_number = row_number())
## # A tibble: 336,776 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 542 540 2.00 923
## 4 2013 1 1 544 545 -1.00 1004
## 5 2013 1 1 554 600 -6.00 812
## 6 2013 1 1 554 558 -4.00 740
## 7 2013 1 1 555 600 -5.00 913
## 8 2013 1 1 557 600 -3.00 709
## 9 2013 1 1 557 600 -3.00 838
## 10 2013 1 1 558 600 -2.00 753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, row_number <int>
babynames::babynames
library(babynames)
babynames
## # A tibble: 1,858,689 x 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1880 F Mary 7065 0.0724
## 2 1880 F Anna 2604 0.0267
## 3 1880 F Emma 2003 0.0205
## 4 1880 F Elizabeth 1939 0.0199
## 5 1880 F Minnie 1746 0.0179
## 6 1880 F Margaret 1578 0.0162
## 7 1880 F Ida 1472 0.0151
## 8 1880 F Alice 1414 0.0145
## 9 1880 F Bertha 1320 0.0135
## 10 1880 F Sarah 1288 0.0132
## # ... with 1,858,679 more rows
babynames %>%
count(name, n, prop) %>%
filter(nn > 1)
## # A tibble: 0 x 4
## # ... with 4 variables: name <chr>, n <int>, prop <dbl>, nn <int>
We thus see that (name, n, prop) specifies a key.
nasaweather::atmos
library(nasaweather)
##
## Attaching package: 'nasaweather'
## The following object is masked from 'package:dplyr':
##
## storms
atmos
## # A tibble: 41,472 x 11
## lat long year month surftemp temp pressure ozone cloudlow cloudmid
## <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 36.2 -114 1995 1 273 272 835 304 7.50 34.5
## 2 33.7 -114 1995 1 280 282 940 304 11.5 32.5
## 3 31.2 -114 1995 1 285 285 960 298 16.5 26.0
## 4 28.7 -114 1995 1 289 291 990 276 20.5 14.5
## 5 26.2 -114 1995 1 292 293 1000 274 26.0 10.5
## 6 23.7 -114 1995 1 294 294 1000 264 30.0 9.50
## 7 21.2 -114 1995 1 295 295 1000 258 29.5 11.0
## 8 18.7 -114 1995 1 298 297 1000 252 26.5 17.5
## 9 16.2 -114 1995 1 300 298 1000 250 27.5 18.5
## 10 13.7 -114 1995 1 300 299 1000 250 26.0 16.5
## # ... with 41,462 more rows, and 1 more variable: cloudhigh <dbl>
atmos %>%
count(lat, long, year, month) %>%
filter(n > 1)
## # A tibble: 0 x 5
## # ... with 5 variables: lat <dbl>, long <dbl>, year <int>, month <int>,
## # n <int>
We thus see that (lat, long, year, month) specifies a key.
fueleconomy::vehicles
library(fueleconomy)
v <- vehicles
vehicles %>%
count(id) %>%
filter(n > 1)
## # A tibble: 0 x 2
## # ... with 2 variables: id <int>, n <int>
We thus see that the variable id
is a key.
ggplot2::diamonds
library(ggplot2)
diamonds
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.230 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
## 2 0.210 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
## 3 0.230 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
## 5 0.310 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
## 6 0.240 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48
## 7 0.240 Very Good I VVS1 62.3 57.0 336 3.95 3.98 2.47
## 8 0.260 Very Good H SI1 61.9 55.0 337 4.07 4.11 2.53
## 9 0.220 Fair E VS2 65.1 61.0 337 3.87 3.78 2.49
## 10 0.230 Very Good H VS1 59.4 61.0 338 4.00 4.05 2.39
## # ... with 53,930 more rows
diamonds %>%
count(x, y, z, price, depth, table, clarity, carat, color, clarity) %>%
filter(n > 1)
## # A tibble: 149 x 10
## x y z price depth table clarity carat color n
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <ord> <dbl> <ord> <int>
## 1 0 0 0 2130 64.1 60.0 SI2 0.710 F 2
## 2 4.23 4.26 2.69 394 63.4 57.0 VS1 0.300 J 2
## 3 4.26 4.23 2.69 506 63.4 57.0 VS1 0.300 J 2
## 4 4.26 4.29 2.66 450 62.2 57.0 SI1 0.300 H 2
## 5 4.27 4.28 2.66 450 62.2 57.0 SI1 0.300 H 2
## 6 4.27 4.30 2.66 862 62.1 57.0 VVS1 0.300 E 2
## 7 4.28 4.31 2.67 552 62.2 58.0 SI1 0.300 D 2
## 8 4.29 4.31 2.71 526 63.0 55.0 VS2 0.300 G 2
## 9 4.29 4.31 2.73 571 63.5 56.0 SI1 0.310 D 2
## 10 4.31 4.28 2.67 709 62.2 58.0 SI1 0.300 D 2
## # ... with 139 more rows
In the diamonds dataset, grouping by all of the keys as above leaves 149 rows with more than one member. The dataset thus contains no key.
ave_delay <- flights %>%
group_by(dest) %>%
summarize(average = mean(arr_delay, na.rm = TRUE))
airport_distribution <- ave_delay %>% left_join(airports, by = c("dest" = "faa"))
airport_distribution %>%
filter(!is.na(lat)) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(aes(color = average)) +
coord_quickmap() +
labs(color = "average\ndelay in\nminutes")
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
flights
.airports_select <- airports %>% select(faa, lat, lon)
temp <- flights %>%
left_join(airports_select, by = c("origin" = "faa")) %>%
rename(origin_lat = lat, origin_lon = lon) %>%
left_join(airports_select, by = c("dest" = "faa")) %>%
rename(dest_lat = lat, dest_lon = lon)
temp
## # A tibble: 336,776 x 23
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 542 540 2.00 923
## 4 2013 1 1 544 545 -1.00 1004
## 5 2013 1 1 554 600 -6.00 812
## 6 2013 1 1 554 558 -4.00 740
## 7 2013 1 1 555 600 -5.00 913
## 8 2013 1 1 557 600 -3.00 709
## 9 2013 1 1 557 600 -3.00 838
## 10 2013 1 1 558 600 -2.00 753
## # ... with 336,766 more rows, and 16 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, origin_lat <dbl>, origin_lon <dbl>,
## # dest_lat <dbl>, dest_lon <dbl>
#find the average arrival delay for each plane
plane_join <- flights %>%
inner_join(planes, by = "tailnum")
delay_stats_year <- plane_join %>%
group_by(year.y) %>%
summarize(mean = mean(arr_delay, na.rm = TRUE),
`50th percentile` = median(arr_delay, na.rm = TRUE),
`25th percentile` = quantile(arr_delay, probs = .25, na.rm = TRUE),
`75th percentile` = quantile(arr_delay, probs = .75, na.rm = TRUE),
`10th percentile` = quantile(arr_delay, probs = .10, na.rm = TRUE),
`90th percentile` = quantile(arr_delay, probs = .90, na.rm = TRUE))
stacked_stats <- delay_stats_year %>%
gather(`90th percentile`, `75th percentile`, `50th percentile`, `25th percentile`,
`10th percentile`, `mean`,
key = "statistic", value = "value")
ggplot(stacked_stats) +
geom_line(aes(x = 2013 - year.y, y = value,
color = statistic)) +
scale_color_discrete(breaks = c("90th percentile", "75th percentile", "mean",
"50th percentile", "25th percentile", "10th percentile")) +
labs(x = "age in years", y = "delay in minutes",
title = "Flight Delays by Year of Manufacture of Plane") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 6 rows containing missing values (geom_path).
The bump in the mean and at the upper percentiles of this plot, peaking at perhaps 8 and 9 years of age, is interesting. It appears that at least a significant number of the planes may have become less reliable over the beginning of the first decade of use, and then become more reliable (supposing, of course, that the data reflect the condition of the planes and not, say, the selection of certain models to be used in certain weather conditions).
The above data is by individual flight. Let’s look at the averages for the individual planes.
plane_delay_averages <- flights %>%
group_by(tailnum) %>%
summarize(average_arr_delay = mean(arr_delay, na.rm = TRUE))
plane_join <- plane_delay_averages %>%
inner_join(planes, by = "tailnum")
delay_stats_year <- plane_join %>%
group_by(year) %>%
summarize(mean = mean(average_arr_delay, na.rm = TRUE),
`50th percentile` = median(average_arr_delay, na.rm = TRUE),
`25th percentile` = quantile(average_arr_delay, probs = .25, na.rm = TRUE),
`75th percentile` = quantile(average_arr_delay, probs = .75, na.rm = TRUE),
`10th percentile` = quantile(average_arr_delay, probs = .10, na.rm = TRUE),
`90th percentile` = quantile(average_arr_delay, probs = .90, na.rm = TRUE))
stacked_stats <- delay_stats_year %>%
gather(`90th percentile`, `75th percentile`, `50th percentile`, `25th percentile`,
`10th percentile`, `mean`,
key = "statistic", value = "value")
ggplot(stacked_stats) +
geom_line(aes(x = 2013 - year, y = value,
color = statistic)) +
scale_color_discrete(breaks = c("90th percentile", "75th percentile", "mean",
"50th percentile", "25th percentile", "10th percentile")) +
labs(x = "age in years", y = "average delay in minutes",
title = "Flight Delay Averages by Year of Manufacture of the Plane") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 6 rows containing missing values (geom_path).
The data for the average arrival delay for each plane is consistent with the suspicion, raised in connection with the previous plot, that, initially, the delays become somewhat greater as the planes become older, until the planes are just short of 10 years. After that, there may be a week trend towards fewer delays as the planes get older. The sudden change in the behavior of the plot after perhaps 28 years is also interesting. It might reflect a practice of taking most of the planes out of service at this relatively advanced age. Does the data contain other evidence for this hypothesis?
older_plane_investigation <- flights %>%
inner_join(planes, by = "tailnum") %>%
group_by(year.y) %>%
count(tailnum) %>%
count(year.y)
ggplot(older_plane_investigation %>% na.omit) +
geom_point(aes(x = 2013 - year.y, y = nn)) +
labs(x = "age in years", y = "number of planes with at least one departure, if any",
title = "Numbers of Planes Departing from New York City Airports") +
theme(plot.title = element_text(hjust = 0.5))
As suggested in the Flight Delay Averages plot, we see from the above that the number of planes in use, for departures from New York City, falls precipitously as the planes approach 30 years in age.
#consider various averages over hours (all of the hours, in the year, reflected in the data)
weather_by_flight_by_hour <- flights %>%
left_join(weather, by = c("year", "month", "day", "hour", "origin")) %>%
group_by(year, month, day, hour, origin, temp, dewp, humid,
wind_dir, wind_speed, wind_gust, precip, pressure, visib) %>%
summarize(ave_dep_delay = mean(dep_delay, na.rm = TRUE),
n = n(),
proportion_cancelled = sum(is.na(dep_delay)) / n)
#To confirm some suggestions from plots of the values calculated in the above, we consider averages of delays over various values of the weather variables (or, if useful, intervals of such values)
weather_by_flight <- flights %>%
left_join(weather, by = c("year", "month", "day", "hour", "origin")) %>%
group_by(visib) %>%
summarize(ave_dep_delay = mean(dep_delay, na.rm = TRUE),
`number_of_cases / 10,000` = n() / 10000,
percentage_cancelled = 100 * sum(is.na(dep_delay)) / n())
#put data to be plotted into a single column, to make it "tidy," and then plot it
weather_by_flight %>% na.omit() %>%
gather(ave_dep_delay, percentage_cancelled, `number_of_cases / 10,000`,
key = "statistic", value = "value") %>%
ggplot() + geom_point(aes(x = visib, y = value, color = statistic)) +
labs(x = "visibility (with \"10\" standing for 10 or greater)",
title = "Airport Departure Statistics, by Visibility") +
theme(plot.title = element_text(hjust = 0.5),
axis.title.y = element_blank())
As visibility increases, we see a decrease in both cancellations and the delays. However, given that the vast majority of cases rest in the visibility category of 10 or greater, some guardedness in our conclusions, based on the data in the nycflights13
package, is warranted.
ave_delay_6_13 <- flights %>%
filter(year == 2013 & month == 6 & day ==13) %>%
group_by(dest) %>%
summarize(average = mean(arr_delay, na.rm = TRUE))
airport_distribution_6_13 <- ave_delay_6_13 %>% left_join(airports, by = c("dest" = "faa"))
airport_distribution_6_13 %>%
filter(!is.na(lat) & lon > -130) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(aes(color = average)) +
coord_quickmap() +
labs(color = "average\ndelay in\nminutes",
title = "Arrival Delays for Flights from New York, June 13, 2013") +
theme(plot.title = element_text(hjust = .5))
There were notable delays in Chattanooga, Tennessee and Birmingham, Alabama. These may have been due to severe storms called decheros. See https://www.weatherworksinc.com/June-12-13-Derechoes .
with_tailnum <- flights %>%
left_join(planes, by = "tailnum") %>%
count(carrier)
without_tailnum <- flights %>%
anti_join(planes, by = "tailnum") %>%
count(carrier)
comparison <- with_tailnum %>%
left_join(without_tailnum, by = "carrier")
#check to see whether the `carrier` variable has `NA`s. (It doesn't)
flights %>% filter(is.na(carrier))
## # A tibble: 0 x 19
## # ... with 19 variables: year <int>, month <int>, day <int>,
## # dep_time <int>, sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## # sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
comparison
## # A tibble: 16 x 3
## carrier n.x n.y
## <chr> <int> <int>
## 1 9E 18460 1044
## 2 AA 32729 22558
## 3 AS 714 NA
## 4 B6 54635 830
## 5 DL 48110 110
## 6 EV 54173 NA
## 7 F9 685 50
## 8 FL 3260 187
## 9 HA 342 NA
## 10 MQ 26397 25397
## 11 OO 32 NA
## 12 UA 58665 1693
## 13 US 20536 699
## 14 VX 5162 NA
## 15 WN 12275 38
## 16 YV 601 NA
Column n.x
reflects all of the data in the flights
dataframe, whereas n.y
reflects flights for which tail number information is missing. We see that MQ (Envoy Airlines) and AA (American Airlines) are responsible for most of the missing tail-number data.
flights
to only show flights with planes that have flown at least 100 flights.at_least_hundred <- flights %>%
filter(!is.na(arr_delay)) %>% #avoid counting cancelled flights
group_by(tailnum) %>%
mutate(n = n()) %>%
ungroup() %>%
filter(n >= 100)
at_least_hundred
## # A tibble: 221,908 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 544 545 -1.00 1004
## 4 2013 1 1 554 558 -4.00 740
## 5 2013 1 1 555 600 -5.00 913
## 6 2013 1 1 557 600 -3.00 709
## 7 2013 1 1 557 600 -3.00 838
## 8 2013 1 1 558 600 -2.00 849
## 9 2013 1 1 558 600 -2.00 853
## 10 2013 1 1 558 600 -2.00 923
## # ... with 221,898 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, n <int>
The above may be the best answer if the intention is to show only the flights that were not cancelled. Otherwise, the following should work.
temp <- flights %>%
filter(!is.na(arr_delay)) %>% #to avoid counting cancelled flights has having occurred
count(tailnum) %>%
filter(n >= 100)
#bring back the unfiltered dataframe, so as to count all scheduled flights (whether
#cancelled or not)
second_method <- flights %>%
semi_join(temp, by = "tailnum")
second_method
## # A tibble: 227,170 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2.00 830
## 2 2013 1 1 533 529 4.00 850
## 3 2013 1 1 544 545 -1.00 1004
## 4 2013 1 1 554 558 -4.00 740
## 5 2013 1 1 555 600 -5.00 913
## 6 2013 1 1 557 600 -3.00 709
## 7 2013 1 1 557 600 -3.00 838
## 8 2013 1 1 558 600 -2.00 849
## 9 2013 1 1 558 600 -2.00 853
## 10 2013 1 1 558 600 -2.00 923
## # ... with 227,160 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
fueleconomy::vehicles
and fueleconomy::common
to find only the records for the most common models.We can think of vehicle models at various levels specificity. When a certain car is rebranded under a different name (say, a Chevrolet branded as a GM), is it the same “model”? To see the the extent of this sort of thing (along with, probably, some other issues that could go into whether the variable model
uniquely identifies vehicle models), consider the following calculation, which uncovers the values of model
that have more than one corresponding vehicle make.
library(fueleconomy)
vehicles %>%
count(model, make) %>%
group_by(model) %>%
mutate(`number of corresponding makes` = n()) %>%
filter(`number of corresponding makes` > 1)
## # A tibble: 126 x 4
## # Groups: model [60]
## model make n `number of correspond…
## <chr> <chr> <int> <int>
## 1 200 Audi 4 2
## 2 200 Chrysler 14 2
## 3 240 DL/240 GL Wagon Mcevoy Motors 2 2
## 4 240 DL/240 GL Wagon Volvo 10 2
## 5 300E Lambda Control Systems 1 2
## 6 300E Mercedes-Benz 11 2
## 7 300SL J.K. Motors 2 2
## 8 300SL Mercedes-Benz 9 2
## 9 500SE Mercedes-Benz 3 2
## 10 500SE Texas Coach Company 1 2
## # ... with 116 more rows
In most but not all of the cases, the two or three models with the different entries for make
are significantly different. I’ll thus assume that two vehicles are different models only if the corresponding values of both model
and make
are different. Thus restricting the information in vehicles
to the “common” models (those reflected in the common
dataframe):
vehicles %>%
semi_join(common, by = c("make", "model"))
## # A tibble: 14,531 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <int> <chr> <chr> <int> <chr> <chr> <chr> <int> <dbl> <chr> <int> <int>
## 1 1833 Acura Inte… 1986 Subc… Auto… Fron… 4 1.60 Regu… 28 22
## 2 1834 Acura Inte… 1986 Subc… Manu… Fron… 4 1.60 Regu… 28 23
## 3 3037 Acura Inte… 1987 Subc… Auto… Fron… 4 1.60 Regu… 28 22
## 4 3038 Acura Inte… 1987 Subc… Manu… Fron… 4 1.60 Regu… 28 23
## 5 4183 Acura Inte… 1988 Subc… Auto… Fron… 4 1.60 Regu… 27 22
## 6 4184 Acura Inte… 1988 Subc… Manu… Fron… 4 1.60 Regu… 28 23
## 7 5303 Acura Inte… 1989 Subc… Auto… Fron… 4 1.60 Regu… 27 22
## 8 5304 Acura Inte… 1989 Subc… Manu… Fron… 4 1.60 Regu… 28 23
## 9 6442 Acura Inte… 1990 Subc… Auto… Fron… 4 1.80 Regu… 24 20
## 10 6443 Acura Inte… 1990 Subc… Manu… Fron… 4 1.80 Regu… 26 21
## # ... with 14,521 more rows
This question could pertain to 48-hour periods or, instead, to sets of 48 one-hour periods. Considering the second of these readings, and supposing that the periods with the worst delays are the periods with the greatest average of all delays,
average_delays <-
flights %>%
group_by(year, month, day, hour) %>%
summarize(average_delay = mean(arr_delay)) %>%
ungroup()
average_delay <- arrange(average_delays, desc(average_delay))
top_48 <- head(average_delay, n = 48L)
top_48_weather <- top_48 %>%
left_join(weather, by = c("year", "month", "day", "hour"))
average_delays_weather <- average_delays %>%
left_join(weather, by = c("year", "month", "day", "hour"))
ave_48 <- map_dbl(top_48_weather, mean, na.rm = TRUE)
## Warning in mean.default(.x[[i]], ...): argument is not numeric or logical:
## returning NA
ave_all <- map_dbl(average_delays_weather, mean, na.rm = TRUE)
## Warning in mean.default(.x[[i]], ...): argument is not numeric or logical:
## returning NA
difference <- ave_48 - ave_all #find the differences in the mean, where this makes sense
difference
## year month day hour average_delay
## 0.000000e+00 6.812485e-02 2.238777e+00 7.956791e+00 7.616687e+01
## origin temp dewp humid wind_dir
## NA 1.120726e+01 1.636398e+01 1.390006e+01 -2.581974e+01
## wind_speed wind_gust precip pressure visib
## -4.665607e-01 -5.369088e-01 3.158352e-03 -4.334032e+00 -1.584302e+00
## time_hour
## 3.892214e+05
The above shows the differences between the column means for the 48 hours with the greatest delays and the column means for all of the flights (for all of the hours of the year). As one might expect, for the 48 hours with the greatest delays, the average visibility and pressure were somewhat lesser, while the average precipitation was somewhat greater.
The dataframe anti_join(flights, airports, by = c("dest" = "faa"))
identifies the flights in the flights
dataframe for which the destination is not listed in the airports
dataframe. Both of the flights and the airports dataframe are supposed to cover domestic flights, and airports, only. So it may seem that result of the “anti-join” include fights to domestic airports for which data in the airports
dataframe is missing. Let’s look.
anti_join(flights, airports, by = c("dest" = "faa")) %>%
count(dest)
## # A tibble: 4 x 2
## dest n
## <chr> <int>
## 1 BQN 896
## 2 PSE 365
## 3 SJU 5819
## 4 STT 522
All these airports are in US territories (that are not states). Rather than simply indicating missing data in the airports dataframe, the result of the anti-join may indicate an ambiguity with respect to the meaning of “domestic,” with the airports
dataframe constructed with a somewhat more restrictive interpretation.
anti_join(airports, flights, by = c("faa" = "dest")) %>%
count(faa)
## # A tibble: 1,357 x 2
## faa n
## <chr> <int>
## 1 04G 1
## 2 06A 1
## 3 06C 1
## 4 06N 1
## 5 09J 1
## 6 0A9 1
## 7 0G6 1
## 8 0G7 1
## 9 0P2 1
## 10 0S9 1
## # ... with 1,347 more rows
It appears that there were 1,357 domestic airports reflected in the airports data to which there were no flights (as per the flights
data) from New York.
We take a quick look at the headings and initial values of some possibly relevant data frames.
glimpse(airlines)
## Observations: 16
## Variables: 2
## $ carrier <chr> "9E", "AA", "AS", "B6", "DL", "EV", "F9", "FL", "HA", ...
## $ name <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska...
glimpse(planes)
## Observations: 3,322
## Variables: 9
## $ tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575",...
## $ year <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1...
## $ type <chr> "Fixed wing multi engine", "Fixed wing multi engi...
## $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE"...
## $ model <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", ...
## $ engines <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ seats <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 5...
## $ speed <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ engine <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan...
glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"...
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94...
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013...
It looks like the flights
dataframe may be sufficient for making this determination.
fl <- flights %>%
count(tailnum, carrier) %>%
count(tailnum) %>%
filter(nn > 1)
fl
## # A tibble: 18 x 2
## tailnum nn
## <chr> <int>
## 1 N146PQ 2
## 2 N153PQ 2
## 3 N176PQ 2
## 4 N181PQ 2
## 5 N197PQ 2
## 6 N200PQ 2
## 7 N228PQ 2
## 8 N232PQ 2
## 9 N933AT 2
## 10 N935AT 2
## 11 N977AT 2
## 12 N978AT 2
## 13 N979AT 2
## 14 N981AT 2
## 15 N989AT 2
## 16 N990AT 2
## 17 N994AT 2
## 18 <NA> 7
It would thus seem that at least 17 planes were used more by than one airline. Perhaps this reflects the fact that the same journey can be listed for more than one airline? Let’s include the airlines in this information, using semi_join
, and see if we find, for these planes, matching flight numbers for two carriers.
flj <- flights %>%
semi_join(fl, key = "tailnum") %>%
count(tailnum, carrier) %>%
na.omit()
## Joining, by = "tailnum"
flights %>%
semi_join(flj, by = c("tailnum", "carrier")) %>%
count(carrier, tailnum, flight) %>%
count(tailnum, flight) %>%
filter(nn > 1)
## # A tibble: 0 x 3
## # ... with 3 variables: tailnum <chr>, flight <int>, nn <int>
Because the result here is zero rows, we see that, for the tail numbers identified in fl
, the combination of tail number and flight is sufficient to identify the carrier. The answer to the question I posed above is thus “no.”