4 Data transformation

4.1 Questions

For question 3 I don’t really know where to start with calculations

library(nycflights13)
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.3     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

#> ── Attaching packages ──────────────────────────────────── tidyverse 1.3.2 ──
#> ✔ ggplot2 3.4.0           ✔ purrr   1.0.1.9000 
#> ✔ tibble  3.1.8           ✔ dplyr   1.0.99.9000
#> ✔ tidyr   1.2.1.9001      ✔ stringr 1.5.0      
#> ✔ readr   2.1.3           ✔ forcats 0.5.2      
#> ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()

##Solutions

4.2 4.2.5 Exercises

1.find all flights that

a.had a arrival delay of two hours or more

filter(flights,arr_delay >= 2) %>% arrange(desc(arr_delay))

# A tibble: 127,929 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     9      641        900    1301    1242    1530    1272 HA     
 2  2013     6    15     1432       1935    1137    1607    2120    1127 MQ     
 3  2013     1    10     1121       1635    1126    1239    1810    1109 MQ     
 4  2013     9    20     1139       1845    1014    1457    2210    1007 AA     
 5  2013     7    22      845       1600    1005    1044    1815     989 MQ     
 6  2013     4    10     1100       1900     960    1342    2211     931 DL     
 7  2013     3    17     2321        810     911     135    1020     915 DL     
 8  2013     7    22     2257        759     898     121    1026     895 DL     
 9  2013    12     5      756       1700     896    1058    2020     878 AA     
10  2013     5     3     1133       2055     878    1250    2215     875 MQ     
# … with 127,919 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

b.flew to houston (IAH or HOU)

glimpse(flights)

Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

view(flights)

flights %>% filter(dest %in% c("IAH","HOU"))

# A tibble: 9,313 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      623        627      -4     933     932       1 UA     
 4  2013     1     1      728        732      -4    1041    1038       3 UA     
 5  2013     1     1      739        739       0    1104    1038      26 UA     
 6  2013     1     1      908        908       0    1228    1219       9 UA     
 7  2013     1     1     1028       1026       2    1350    1339      11 UA     
 8  2013     1     1     1044       1045      -1    1352    1351       1 UA     
 9  2013     1     1     1114        900     134    1447    1222     145 UA     
10  2013     1     1     1205       1200       5    1503    1505      -2 UA     
# … with 9,303 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

c.Were operated by United, American, or Delta

flights %>% filter(carrier %in% c("UA","DL","AA"))

# A tibble: 139,504 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      542        540       2     923     850      33 AA     
 4  2013     1     1      554        600      -6     812     837     -25 DL     
 5  2013     1     1      554        558      -4     740     728      12 UA     
 6  2013     1     1      558        600      -2     753     745       8 AA     
 7  2013     1     1      558        600      -2     924     917       7 UA     
 8  2013     1     1      558        600      -2     923     937     -14 UA     
 9  2013     1     1      559        600      -1     941     910      31 AA     
10  2013     1     1      559        600      -1     854     902      -8 UA     
# … with 139,494 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

d.departed in summer(july,august,september)

flights %>% filter(month %in% c(7,8,9))

# A tibble: 86,326 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     7     1        1       2029     212     236    2359     157 B6     
 2  2013     7     1        2       2359       3     344     344       0 B6     
 3  2013     7     1       29       2245     104     151       1     110 B6     
 4  2013     7     1       43       2130     193     322      14     188 B6     
 5  2013     7     1       44       2150     174     300     100     120 AA     
 6  2013     7     1       46       2051     235     304    2358     186 B6     
 7  2013     7     1       48       2001     287     308    2305     243 VX     
 8  2013     7     1       58       2155     183     335      43     172 B6     
 9  2013     7     1      100       2146     194     327      30     177 B6     
10  2013     7     1      100       2245     135     337     135     122 B6     
# … with 86,316 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

e.Arrived more than two hours late, but didn’t leave late

flights %>% filter(dep_delay >= 0,arr_delay < -2)

# A tibble: 40,053 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      559        559       0     702     706      -4 B6     
 2  2013     1     1      600        600       0     851     858      -7 B6     
 3  2013     1     1      601        600       1     844     850      -6 B6     
 4  2013     1     1      607        607       0     858     915     -17 UA     
 5  2013     1     1      615        615       0    1039    1100     -21 B6     
 6  2013     1     1      615        615       0     833     842      -9 DL     
 7  2013     1     1      644        636       8     931     940      -9 UA     
 8  2013     1     1      646        645       1     910     916      -6 UA     
 9  2013     1     1      646        645       1    1023    1030      -7 UA     
10  2013     1     1      655        655       0    1021    1030      -9 DL     
# … with 40,043 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

f.Were delayed by at least an hour, but made up over 30 minutes in flight

flights %>% filter(dep_delay <= -1,arr_delay > 30)

# A tibble: 3,041 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      559        600      -1     941     910      31 AA     
 2  2013     1     1      724        730      -6    1111    1040      31 AA     
 3  2013     1     1      754        755      -1    1103    1030      33 WN     
 4  2013     1     1      833        835      -2    1134    1102      32 F9     
 5  2013     1     1     1029       1030      -1    1427    1355      32 AA     
 6  2013     1     1     1745       1749      -4    1943    1909      34 B6     
 7  2013     1     2      656        705      -9    1014     940      34 MQ     
 8  2013     1     2      822        823      -1    1206    1135      31 UA     
 9  2013     1     2      836        840      -4    1114    1036      38 9E     
10  2013     1     2      841        845      -4    1134    1024      70 9E     
# … with 3,031 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

2.Sort flights to find the flights with longest departure delays. Find the flights that left earliest in the morning.

flights %>% arrange(dep_delay) %>% arrange(time_hour) %>% glimpse()

Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 554, 544, 559, 517, 542, 533, 602, 622, 637, 554, 624, …
$ sched_dep_time <int> 558, 545, 559, 515, 540, 529, 610, 630, 645, 600, 630, …
$ dep_delay      <dbl> -4, -1, 0, 2, 2, 4, -8, -8, -8, -6, -6, -6, -5, -4, -4,…
$ arr_time       <int> 740, 1004, 702, 830, 923, 850, 812, 1017, 930, 812, 909…
$ sched_arr_time <int> 728, 1022, 706, 819, 850, 830, 820, 1014, 935, 837, 840…
$ arr_delay      <dbl> 12, -18, -4, 11, 33, 20, -8, 3, -5, -25, 29, 10, 19, -1…
$ carrier        <chr> "UA", "B6", "B6", "UA", "AA", "UA", "DL", "US", "B6", "…
$ flight         <int> 1696, 725, 1806, 1545, 1141, 1714, 1919, 245, 389, 461,…
$ tailnum        <chr> "N39463", "N804JB", "N708JB", "N14228", "N619AA", "N242…
$ origin         <chr> "EWR", "JFK", "JFK", "EWR", "JFK", "LGA", "LGA", "EWR",…
$ dest           <chr> "ORD", "BQN", "BOS", "IAH", "MIA", "IAH", "MSP", "PHX",…
$ air_time       <dbl> 150, 183, 44, 227, 160, 227, 170, 342, 144, 116, 190, 1…
$ distance       <dbl> 719, 1576, 187, 1400, 1089, 1416, 1020, 2133, 950, 762,…
$ hour           <dbl> 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6…
$ minute         <dbl> 58, 45, 59, 15, 40, 29, 10, 30, 45, 0, 30, 30, 0, 10, 1…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

3.Sort flights to find the fastest flights (Hint: try sorting by a calculation).

flights %>% arrange(desc(arr_delay))

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     9      641        900    1301    1242    1530    1272 HA     
 2  2013     6    15     1432       1935    1137    1607    2120    1127 MQ     
 3  2013     1    10     1121       1635    1126    1239    1810    1109 MQ     
 4  2013     9    20     1139       1845    1014    1457    2210    1007 AA     
 5  2013     7    22      845       1600    1005    1044    1815     989 MQ     
 6  2013     4    10     1100       1900     960    1342    2211     931 DL     
 7  2013     3    17     2321        810     911     135    1020     915 DL     
 8  2013     7    22     2257        759     898     121    1026     895 DL     
 9  2013    12     5      756       1700     896    1058    2020     878 AA     
10  2013     5     3     1133       2055     878    1250    2215     875 MQ     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

4.Was there a flight on every day of 2013?

yes

flights %>% arrange(desc(day)) %>% arrange(desc(month))

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013    12    31       13       2359      14     439     437       2 B6     
 2  2013    12    31       18       2359      19     449     444       5 DL     
 3  2013    12    31       26       2245     101     129    2353      96 B6     
 4  2013    12    31      459        500      -1     655     651       4 US     
 5  2013    12    31      514        515      -1     814     812       2 UA     
 6  2013    12    31      549        551      -2     925     900      25 UA     
 7  2013    12    31      550        600     -10     725     745     -20 AA     
 8  2013    12    31      552        600      -8     811     826     -15 EV     
 9  2013    12    31      553        600      -7     741     754     -13 DL     
10  2013    12    31      554        550       4    1024    1027      -3 B6     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

5.Which flights traveled the farthest distance? Which traveled the least distance?

flights %>% arrange(distance)

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     7    27       NA        106      NA      NA     245      NA US     
 2  2013     1     3     2127       2129      -2    2222    2224      -2 EV     
 3  2013     1     4     1240       1200      40    1333    1306      27 EV     
 4  2013     1     4     1829       1615     134    1937    1721     136 EV     
 5  2013     1     4     2128       2129      -1    2218    2224      -6 EV     
 6  2013     1     5     1155       1200      -5    1241    1306     -25 EV     
 7  2013     1     6     2125       2129      -4    2224    2224       0 EV     
 8  2013     1     7     2124       2129      -5    2212    2224     -12 EV     
 9  2013     1     8     2127       2130      -3    2304    2225      39 EV     
10  2013     1     9     2126       2129      -3    2217    2224      -7 EV     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

flights %>% arrange(desc(distance))

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      857        900      -3    1516    1530     -14 HA     
 2  2013     1     2      909        900       9    1525    1530      -5 HA     
 3  2013     1     3      914        900      14    1504    1530     -26 HA     
 4  2013     1     4      900        900       0    1516    1530     -14 HA     
 5  2013     1     5      858        900      -2    1519    1530     -11 HA     
 6  2013     1     6     1019        900      79    1558    1530      28 HA     
 7  2013     1     7     1042        900     102    1620    1530      50 HA     
 8  2013     1     8      901        900       1    1504    1530     -26 HA     
 9  2013     1     9      641        900    1301    1242    1530    1272 HA     
10  2013     1    10      859        900      -1    1449    1530     -41 HA     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

6.Does it matter what order you used filter() and arrange() if you’re using both? Why/why not? Think about the results and how much work the functions would have to do.

I don’t think it matters the two functions would go over each other with no problems either way.

4.3 4.3.5 Exercises

Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

no they don’t match

flights %>% mutate(timearrdep = arr_time - dep_time)

# A tibble: 336,776 × 20
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     1      517        515       2     830     819      11 UA     
 2  2013     1     1      533        529       4     850     830      20 UA     
 3  2013     1     1      542        540       2     923     850      33 AA     
 4  2013     1     1      544        545      -1    1004    1022     -18 B6     
 5  2013     1     1      554        600      -6     812     837     -25 DL     
 6  2013     1     1      554        558      -4     740     728      12 UA     
 7  2013     1     1      555        600      -5     913     854      19 B6     
 8  2013     1     1      557        600      -3     709     723     -14 EV     
 9  2013     1     1      557        600      -3     838     846      -8 B6     
10  2013     1     1      558        600      -2     753     745       8 AA     
# … with 336,766 more rows, 10 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, timearrdep <int>, and abbreviated variable
#   names ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

2.Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?

they show when a plane left when it was supposed to leave and if there was a delay

3.Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights

flights %>% select(dep_time, dep_delay, arr_time, arr_delay)

# A tibble: 336,776 × 4
   dep_time dep_delay arr_time arr_delay
      <int>     <dbl>    <int>     <dbl>
 1      517         2      830        11
 2      533         4      850        20
 3      542         2      923        33
 4      544        -1     1004       -18
 5      554        -6      812       -25
 6      554        -4      740        12
 7      555        -5      913        19
 8      557        -3      709       -14
 9      557        -3      838        -8
10      558        -2      753         8
# … with 336,766 more rows

select(flights, dep_time:arr_delay, -sched_dep_time,-sched_arr_time)

# A tibble: 336,776 × 4
   dep_time dep_delay arr_time arr_delay
      <int>     <dbl>    <int>     <dbl>
 1      517         2      830        11
 2      533         4      850        20
 3      542         2      923        33
 4      544        -1     1004       -18
 5      554        -6      812       -25
 6      554        -4      740        12
 7      555        -5      913        19
 8      557        -3      709       -14
 9      557        -3      838        -8
10      558        -2      753         8
# … with 336,766 more rows

4.What happens if you include the name of a variable multiple times in a select() call?

it just prints out the same outcome as if i only used one

flights %>% select(dep_time, dep_time)

# A tibble: 336,776 × 1
   dep_time
      <int>
 1      517
 2      533
 3      542
 4      544
 5      554
 6      554
 7      555
 8      557
 9      557
10      558
# … with 336,766 more rows

5.What does the any_of() function do? Why might it be helpful in conjunction with this vector?

variables <- c("year", "month", "day", "dep_delay", "arr_delay")

no it doesn’t surprise me.it looks like case doesn’t matter.i can change time to dep

select(flights, contains("TIME"))

# A tibble: 336,776 × 6
   dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
      <int>          <int>    <int>          <int>    <dbl> <dttm>             
 1      517            515      830            819      227 2013-01-01 05:00:00
 2      533            529      850            830      227 2013-01-01 05:00:00
 3      542            540      923            850      160 2013-01-01 05:00:00
 4      544            545     1004           1022      183 2013-01-01 05:00:00
 5      554            600      812            837      116 2013-01-01 06:00:00
 6      554            558      740            728      150 2013-01-01 05:00:00
 7      555            600      913            854      158 2013-01-01 06:00:00
 8      557            600      709            723       53 2013-01-01 06:00:00
 9      557            600      838            846      140 2013-01-01 06:00:00
10      558            600      753            745      138 2013-01-01 06:00:00
# … with 336,766 more rows

4.4 4.4.6 Exercises

Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights |> group_by(carrier, dest) |> summarize(n()))

it looks like 9e has the worst delays

flights |> group_by(carrier, dep_delay) |> summarize(n())

`summarise()` has grouped output by 'carrier'. You can override using the
`.groups` argument.

# A tibble: 4,032 × 3
# Groups:   carrier [16]
   carrier dep_delay `n()`
   <chr>       <dbl> <int>
 1 9E            -24     2
 2 9E            -23     1
 3 9E            -22     1
 4 9E            -21     1
 5 9E            -20     2
 6 9E            -19     6
 7 9E            -18    11
 8 9E            -17    12
 9 9E            -16    18
10 9E            -15    40
# … with 4,022 more rows

flights %>% arrange(desc(dep_delay))

# A tibble: 336,776 × 19
    year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
   <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
 1  2013     1     9      641        900    1301    1242    1530    1272 HA     
 2  2013     6    15     1432       1935    1137    1607    2120    1127 MQ     
 3  2013     1    10     1121       1635    1126    1239    1810    1109 MQ     
 4  2013     9    20     1139       1845    1014    1457    2210    1007 AA     
 5  2013     7    22      845       1600    1005    1044    1815     989 MQ     
 6  2013     4    10     1100       1900     960    1342    2211     931 DL     
 7  2013     3    17     2321        810     911     135    1020     915 DL     
 8  2013     6    27      959       1900     899    1236    2226     850 DL     
 9  2013     7    22     2257        759     898     121    1026     895 DL     
10  2013    12     5      756       1700     896    1058    2020     878 AA     
# … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, and abbreviated variable names
#   ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay

2.Find the most delayed flight to each destination.

flights %>% group_by(dep_delay,dest) %>% summarise(n())

`summarise()` has grouped output by 'dep_delay'. You can override using the
`.groups` argument.

# A tibble: 16,426 × 3
# Groups:   dep_delay [528]
   dep_delay dest  `n()`
       <dbl> <chr> <int>
 1       -43 DEN       1
 2       -33 MSY       1
 3       -32 IAD       1
 4       -30 TPA       1
 5       -27 DEN       1
 6       -26 DTW       1
 7       -25 DTW       1
 8       -25 TYS       1
 9       -24 BUF       1
10       -24 FLL       1
# … with 16,416 more rows

3.How do delays vary over the course of the day. Illustrate your answer with a plot.

ggplot(flights,aes(x = time_hour,y = dep_delay)) + geom_smooth()

4.What happens if you supply a negative n to slice_min() and friends?

you get a error

flights %>% slice_min(-n)

Error in `slice_min()`:
! Can't compute indices.
Caused by error in `-n`:
! invalid argument to unary operator

5.Explain what count() does in terms of the dplyr verbs you just learn. What does the sort argument to count() do?

count,counts the number of things in each specific group,will show the largest group at the top

6.Suppose we have the following tiny data frame

df <- tibble(
  x = 1:5,
  y = c("a", "b", "a", "a", "b"),
  z = c("K", "K", "L", "L", "K")
)

a.What does the following code do? Run it, analyze the result, and describe what group_by() does.

it would group by y

df |>
  group_by(y)

# A tibble: 5 × 3
# Groups:   y [2]
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K

b.What does the following code do? Run it, analyze the result, and describe what arrange() does. Also comment on how it’s different from the group_by() in part (a)?

what arrange did here is arranged y in alphabetical order in this case

df |>
  arrange(y)

# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     3 a     L    
3     4 a     L    
4     2 b     K    
5     5 b     K

c.What does the following code do? Run it, analyze the result, and describe what the pipeline does

the pipeline tells the code what to do next, it groups by y then finds the average of x

df |>
  group_by(y) |>
  summarize(mean_x = mean(x))

# A tibble: 2 × 2
  y     mean_x
  <chr>  <dbl>
1 a       2.67
2 b       3.5

d.What does the following code do? Run it, analyze the result, and describe what the pipeline does. Then, comment on what the message says.

the code groups by y and z then averages x,then it shows theres a average of 1 k for a y,an average of 3.5 L’s for the second a in y and an average of 3.5 k’s for b

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

`summarise()` has grouped output by 'y'. You can override using the `.groups`
argument.

# A tibble: 3 × 3
# Groups:   y [2]
  y     z     mean_x
  <chr> <chr>  <dbl>
1 a     K        1  
2 a     L        3.5
3 b     K        3.5

e.What does the following code do? Run it, analyze the result, and describe what the pipeline does. How is the output different from the one in part (d)

this completely gets rid of the group by part the thing that does this is .groups = “drop”

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x), .groups = "drop")

# A tibble: 3 × 3
  y     z     mean_x
  <chr> <chr>  <dbl>
1 a     K        1  
2 a     L        3.5
3 b     K        3.5

f.What do the following pipelines do? Run both, analyze the results, and describe what each pipeline does. How are the outputs of the two pipelines different?

these codes do the same thing the only difference is that the first code shows the information in a more efficient way due to summarize

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

`summarise()` has grouped output by 'y'. You can override using the `.groups`
argument.

# A tibble: 3 × 3
# Groups:   y [2]
  y     z     mean_x
  <chr> <chr>  <dbl>
1 a     K        1  
2 a     L        3.5
3 b     K        3.5

df |>
  group_by(y, z) |>
  mutate(mean_x = mean(x))

# A tibble: 5 × 4
# Groups:   y, z [3]
      x y     z     mean_x
  <int> <chr> <chr>  <dbl>
1     1 a     K        1  
2     2 b     K        3.5
3     3 a     L        3.5
4     4 a     L        3.5
5     5 b     K        3.5