This is an R script ported from here.
# Load packages
library("dplyr")
library("ggplot2")
library("nycflights13")
library("lubridate")
# it's a data.frame, but also a tbl_df.
# doesn't print entire thing to screen.
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
class(flights)
## [1] "tbl_df" "tbl" "data.frame"
weather
## # A tibble: 26,130 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## 7 EWR 2013 1 1 7 39.02 26.96 61.63 250 8.05546
## 8 EWR 2013 1 1 8 39.02 28.04 64.43 240 11.50780
## 9 EWR 2013 1 1 9 39.92 28.04 62.21 250 12.65858
## 10 EWR 2013 1 1 10 39.02 28.04 64.43 260 12.65858
## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model
## <chr> <int> <chr> <chr> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 7 N107US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 8 N108UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 9 N109UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 10 N110UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## # ... with 3,312 more rows, and 4 more variables: engines <int>,
## # seats <int>, speed <int>, engine <chr>
airports
## # A tibble: 1,458 x 8
## faa name lat lon alt tz
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -6
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6
## 4 06N Randall Airport 41.43191 -74.39156 523 -5
## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -5
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -5
## 7 0G6 Williams County Airport 41.46731 -84.50678 730 -5
## 8 0G7 Finger Lakes Regional Airport 42.88356 -76.78123 492 -5
## 9 0P2 Shoestring Aviation Airfield 39.79482 -76.64719 1000 -5
## 10 0S9 Jefferson County Intl 48.05381 -122.81064 108 -8
## # ... with 1,448 more rows, and 2 more variables: dst <chr>, tzone <chr>
airlines
## # A tibble: 16 x 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
# dplyr also gives you verbs. All take a tbl_df as first argument.
# on their own, not much that base R can't do.
## select particular variables from flights
select(flights, year, month, day)
## # A tibble: 336,776 x 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 1
## 3 2013 1 1
## 4 2013 1 1
## 5 2013 1 1
## 6 2013 1 1
## 7 2013 1 1
## 8 2013 1 1
## 9 2013 1 1
## 10 2013 1 1
## # ... with 336,766 more rows
## filter based on some condition. all fights taken by this plane.
filter(flights, tailnum=="N14228")
## # A tibble: 111 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 8 1435 1440 -5 1717
## 3 2013 1 9 717 700 17 812
## 4 2013 1 9 1143 1144 -1 1425
## 5 2013 1 13 835 824 11 1030
## 6 2013 1 16 1829 1730 59 2117
## 7 2013 1 22 1902 1808 54 2214
## 8 2013 1 23 1050 1056 -6 1143
## 9 2013 1 23 1533 1529 4 1641
## 10 2013 1 25 724 720 4 1000
## # ... with 101 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
## mutate adds new columns. time made up in air:
mutate(flights, madeup=dep_delay-arr_delay)
## # A tibble: 336,776 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, madeup <dbl>
## summarize reduces grouped data to a single row.
summarize(flights, avgdelay=mean(arr_delay, na.rm=TRUE))
## # A tibble: 1 x 1
## avgdelay
## <dbl>
## 1 6.895377
## group_by turns existing table into grouped_df class
## summary operations are performed by the group
group_by(flights, dest)
## # A tibble: 336,776 x 19
## # Groups: dest [105]
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
summarize(group_by(flights, dest), avgdelay=mean(arr_delay, na.rm=TRUE))
## # A tibble: 105 x 2
## dest avgdelay
## <chr> <dbl>
## 1 ABQ 4.381890
## 2 ACK 4.852273
## 3 ALB 14.397129
## 4 ANC -2.500000
## 5 ATL 11.300113
## 6 AUS 6.019909
## 7 AVL 8.003831
## 8 BDL 7.048544
## 9 BGR 8.027933
## 10 BHM 16.877323
## # ... with 95 more rows
# Combining
# 1. flights dataset
# 2. mutate to add "madeup" variable
# 3. filter it where madeup>60
# 4. select particular columns
# %>% as "then"
select(
filter(
mutate(flights,
madeup=dep_delay-arr_delay
), madeup>60
), dep_delay, arr_delay, dest, madeup
)
## # A tibble: 154 x 4
## dep_delay arr_delay dest madeup
## <dbl> <dbl> <chr> <dbl>
## 1 -4 -65 LAX 61
## 2 65 1 OAK 64
## 3 12 -57 SFO 69
## 4 -4 -70 SFO 66
## 5 121 57 LAX 64
## 6 9 -63 LAX 72
## 7 46 -16 LAX 62
## 8 37 -25 SFO 62
## 9 -3 -68 LAS 65
## 10 31 -42 SFO 73
## # ... with 144 more rows
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
mutate(madeup=dep_delay-arr_delay) %>%
filter(madeup>60) %>%
select(dep_delay, arr_delay, dest, madeup)
## # A tibble: 154 x 4
## dep_delay arr_delay dest madeup
## <dbl> <dbl> <chr> <dbl>
## 1 -4 -65 LAX 61
## 2 65 1 OAK 64
## 3 12 -57 SFO 69
## 4 -4 -70 SFO 66
## 5 121 57 LAX 64
## 6 9 -63 LAX 72
## 7 46 -16 LAX 62
## 8 37 -25 SFO 62
## 9 -3 -68 LAS 65
## 10 31 -42 SFO 73
## # ... with 144 more rows
# add a date to flights
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights <- flights %>%
mutate(date=ymd(paste(year, month, day, sep="-"))) %>%
na.omit()
flights
## # A tibble: 327,346 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 327,336 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, date <date>
flights %>% select(date)
## # A tibble: 327,346 x 1
## date
## <date>
## 1 2013-01-01
## 2 2013-01-01
## 3 2013-01-01
## 4 2013-01-01
## 5 2013-01-01
## 6 2013-01-01
## 7 2013-01-01
## 8 2013-01-01
## 9 2013-01-01
## 10 2013-01-01
## # ... with 327,336 more rows
class(flights$date)
## [1] "Date"
# How many flights departed each day?
flights %>% group_by(date) %>% summarize(n=n())
## # A tibble: 365 x 2
## date n
## <date> <int>
## 1 2013-01-01 831
## 2 2013-01-02 928
## 3 2013-01-03 900
## 4 2013-01-04 908
## 5 2013-01-05 717
## 6 2013-01-06 829
## 7 2013-01-07 930
## 8 2013-01-08 892
## 9 2013-01-09 893
## 10 2013-01-10 929
## # ... with 355 more rows
# How many each day from each origin?
flights %>% group_by(date, origin) %>% summarize(n=n())
## # A tibble: 1,095 x 3
## # Groups: date [?]
## date origin n
## <date> <chr> <int>
## 1 2013-01-01 EWR 300
## 2 2013-01-01 JFK 295
## 3 2013-01-01 LGA 236
## 4 2013-01-02 EWR 341
## 5 2013-01-02 JFK 317
## 6 2013-01-02 LGA 270
## 7 2013-01-03 EWR 331
## 8 2013-01-03 JFK 317
## 9 2013-01-03 LGA 252
## 10 2013-01-04 EWR 337
## # ... with 1,085 more rows
# how many by week by origin?
flights %>% group_by(week(date), origin) %>% summarize(n=n())
## # A tibble: 159 x 3
## # Groups: week(date) [?]
## `week(date)` origin n
## <dbl> <chr> <int>
## 1 1 EWR 2187
## 2 1 JFK 2157
## 3 1 LGA 1699
## 4 2 EWR 2208
## 5 2 JFK 2043
## 6 2 LGA 1791
## 7 3 EWR 2170
## 8 3 JFK 1997
## 9 3 LGA 1746
## 10 4 EWR 2129
## # ... with 149 more rows
# we can name a variable in the resulting tbl_df
flights %>% group_by(week=week(date), origin) %>% summarize(n=n())
## # A tibble: 159 x 3
## # Groups: week [?]
## week origin n
## <dbl> <chr> <int>
## 1 1 EWR 2187
## 2 1 JFK 2157
## 3 1 LGA 1699
## 4 2 EWR 2208
## 5 2 JFK 2043
## 6 2 LGA 1791
## 7 3 EWR 2170
## 8 3 JFK 1997
## 9 3 LGA 1746
## 10 4 EWR 2129
## # ... with 149 more rows
# let's create new data grouped by week and destination
# showing number of flights, average delays, and number distinct destinations
byweek <- flights %>%
group_by(week=week(date), origin) %>%
summarize(n=n(), delay=mean(arr_delay), ndests=n_distinct(dest))
byweek
## # A tibble: 159 x 5
## # Groups: week [?]
## week origin n delay ndests
## <dbl> <chr> <int> <dbl> <int>
## 1 1 EWR 2187 9.0740741 82
## 2 1 JFK 2157 0.2814094 60
## 3 1 LGA 1699 1.8022366 44
## 4 2 EWR 2208 3.1077899 80
## 5 2 JFK 2043 -2.2623593 59
## 6 2 LGA 1791 -4.8330542 40
## 7 3 EWR 2170 15.0589862 80
## 8 3 JFK 1997 -1.5578368 58
## 9 3 LGA 1746 6.0337915 39
## 10 4 EWR 2129 18.7745420 80
## # ... with 149 more rows
# plot number of flights by week by origin city
byweek %>% ggplot(aes(week, n, colour=origin)) + geom_line()
# why the dip at the end?
# let's get total number of flights by week
byweek %>% summarize(sum(n))
## # A tibble: 53 x 2
## week `sum(n)`
## <dbl> <int>
## 1 1 6043
## 2 2 6042
## 3 3 5913
## 4 4 5894
## 5 5 5803
## 6 6 5089
## 7 7 6190
## 8 8 6282
## 9 9 6315
## 10 10 6031
## # ... with 43 more rows
# only 364 days in year, Dec 31 is probably day 1 of week 53
52*7
## [1] 364
# same plot but not looking at week 53
byweek %>%
filter(week<=52) %>%
ggplot(aes(week, n, colour=origin)) + geom_line()
ymd("2013-01-01") + weeks(21) # mem day
## [1] "2013-05-28"
ymd("2013-01-01") + weeks(26) # july 4
## [1] "2013-07-02"
ymd("2013-01-01") + weeks(35) # labor day. why JFK continues decline?
## [1] "2013-09-03"
ymd("2013-01-01") + weeks(47) # TG, xmas, NY
## [1] "2013-11-26"
# why JFK lose number after labor day?
# LGA added a LOT more destinations just before this decline.
# makes more people fly out of there as opposed to JFK!
byweek %>%
filter(week<=52) %>%
ggplot(aes(week, ndests, colour=origin)) + geom_line()
# What causes delays?
# do older planes have more delays?
flights
## # A tibble: 327,346 x 20
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with 327,336 more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, date <date>
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model
## <chr> <int> <chr> <chr> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 7 N107US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 8 N108UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 9 N109UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 10 N110UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## # ... with 3,312 more rows, and 4 more variables: engines <int>,
## # seats <int>, speed <int>, engine <chr>
flights %>%
inner_join(planes, by="tailnum") %>% # explain year.x, .y
group_by(year.y) %>%
summarize(delay=mean(arr_delay)) %>%
filter(!is.na(year.y)) %>%
ggplot(aes(year.y, delay)) + geom_point() + geom_smooth(method="loess")
# what about weather?
# look at delays by month by destination
flights %>%
group_by(month, origin) %>%
summarize(delay=mean(arr_delay)) %>%
ggplot(aes(month, delay, fill=origin)) + geom_bar(stat="identity", position="dodge")
# summertime delays from thunderstorms? winter delays from snow?
# let's look at precipitation
# first lets get the delay by date and origin
# how does weather affect flights?
weather
## # A tibble: 26,130 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## 7 EWR 2013 1 1 7 39.02 26.96 61.63 250 8.05546
## 8 EWR 2013 1 1 8 39.02 28.04 64.43 240 11.50780
## 9 EWR 2013 1 1 9 39.92 28.04 62.21 250 12.65858
## 10 EWR 2013 1 1 10 39.02 28.04 64.43 260 12.65858
## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
# create new dataset grouped by date and origin adding mean delay
delay_byday_byorigin <- flights %>%
group_by(date, origin) %>%
summarize(delay=mean(arr_delay))
delay_byday_byorigin
## # A tibble: 1,095 x 3
## # Groups: date [?]
## date origin delay
## <date> <chr> <dbl>
## 1 2013-01-01 EWR 20.886667
## 2 2013-01-01 JFK 8.088136
## 3 2013-01-01 LGA 7.885593
## 4 2013-01-02 EWR 25.439883
## 5 2013-01-02 JFK 3.268139
## 6 2013-01-02 LGA 7.659259
## 7 2013-01-03 EWR 6.102719
## 8 2013-01-03 JFK 1.766562
## 9 2013-01-03 LGA 10.238095
## 10 2013-01-04 EWR 1.768546
## # ... with 1,085 more rows
# Take weather data, group by YMD, summarize mean participation
# remove missing data, add date with lubridate
# join to data created last time
# plot delay by precip coloring by flight's origin, add loess
weather %>% group_by(year, month, day) %>%
summarize(precip = mean(precip)) %>%
na.omit() %>%
mutate(date=ymd(paste(year, month, day, sep="-"))) %>%
inner_join(delay_byday_byorigin, by="date") %>%
ggplot(aes(precip, delay, col=origin)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.00013056
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.00054722
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.7361e-07
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used
## at -0.00013056
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.00054722
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal
## condition number 0
## Warning in predLoess(object$y, object$x, newx = if
## (is.null(newdata)) object$x else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other
## near singularities as well. 1.7361e-07
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.00012361
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 1.528e-08
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.00012361
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.00012361
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
# Is the average delay related to the average distance flown by a plane?
flights %>% group_by(tailnum) %>%
summarize(dist=mean(distance), delay=mean(arr_delay), count=n()) %>%
ggplot(aes(dist, delay)) + geom_point(aes(size=count), alpha=1/2) + scale_size_area()