library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(nycflights13)
Three types of date/time data:
- <date>
- <time>
- <dttm>

In the flights tibble, the last variable, time_hour, is in the date-time format:
flights %>% print(width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # … with 336,766 more rows
Today:
# current date
today()
## [1] "2020-02-03"
# current date-time
now()
## [1] "2020-02-03 23:45:02 PST"
ymd("2020-01-30")
## [1] "2020-01-30"
mdy("January 30th, 2020")
## [1] "2020-01-30"
dmy("30-Jan-2020")
## [1] "2020-01-30"
ymd_hms("2020-01-30 14:57:25")
## [1] "2020-01-30 14:57:25 UTC"
ymd_hm("2020-01-30 14:57")
## [1] "2020-01-30 14:57:00 UTC"
ymd(20200130)
## [1] "2020-01-30"
# Combine the separate calendar and clock columns of flights into a single
# date-time column named departure.
flights %>%
  select(year, month, day, hour, minute) %>%
  mutate(
    departure = make_datetime(
      year = year, month = month, day = day, hour = hour, min = minute
    )
  )
## # A tibble: 336,776 x 6
## year month day hour minute departure
## <int> <int> <int> <dbl> <dbl> <dttm>
## 1 2013 1 1 5 15 2013-01-01 05:15:00
## 2 2013 1 1 5 29 2013-01-01 05:29:00
## 3 2013 1 1 5 40 2013-01-01 05:40:00
## 4 2013 1 1 5 45 2013-01-01 05:45:00
## 5 2013 1 1 6 0 2013-01-01 06:00:00
## 6 2013 1 1 5 58 2013-01-01 05:58:00
## 7 2013 1 1 6 0 2013-01-01 06:00:00
## 8 2013 1 1 6 0 2013-01-01 06:00:00
## 9 2013 1 1 6 0 2013-01-01 06:00:00
## 10 2013 1 1 6 0 2013-01-01 06:00:00
## # … with 336,766 more rows
# Build a date-time from calendar fields plus a clock time stored as a single
# HHMM number (e.g. 517 becomes 05:17).
#
# year, month, day: calendar components, passed straight to make_datetime().
# time: clock time encoded as hours * 100 + minutes.
make_datetime_100 <- function(year, month, day, time) {
  hour_part <- time %/% 100  # everything above the last two digits
  minute_part <- time %% 100 # the last two digits
  make_datetime(year, month, day, hour_part, minute_part)
}
# Turn the four HHMM clock columns of flights into date-times, then keep only
# origin, dest, the columns ending in "delay" and the columns ending in "time".
flights_dt <- flights %>%
  # Drop rows where dep_time or arr_time is missing.
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    # Each column is rebuilt from year/month/day and its own HHMM value, so the
    # four conversions do not depend on one another.
    sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
    dep_time = make_datetime_100(year, month, day, dep_time),
    sched_arr_time = make_datetime_100(year, month, day, sched_arr_time),
    arr_time = make_datetime_100(year, month, day, arr_time)
  ) %>%
  select(origin, dest, ends_with("delay"), ends_with("time"))

# Print the result.
flights_dt
## # A tibble: 328,063 x 9
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
## 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
## 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
## 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
## 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
## 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
## 7 EWR FLL -5 19 2013-01-01 05:55:00 2013-01-01 06:00:00
## 8 LGA IAD -3 -14 2013-01-01 05:57:00 2013-01-01 06:00:00
## 9 JFK MCO -3 -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA ORD -2 8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # … with 328,053 more rows, and 3 more variables: arr_time <dttm>,
## # sched_arr_time <dttm>, air_time <dbl>
Now we can visualize the distribution of departure times across the year:
# Frequency polygon of departure date-times, one bin per day.
flights_dt %>%
  ggplot(mapping = aes(x = dep_time)) +
  geom_freqpoly(binwidth = 24 * 60 * 60) # bin width is in seconds: 86400 s = 1 day
or within a single day:
# Frequency polygon of departures before 2013-01-02, in 10-minute bins.
flights_dt %>%
  filter(dep_time < ymd(20130102)) %>%
  ggplot(mapping = aes(x = dep_time)) +
  geom_freqpoly(binwidth = 10 * 60) # bin width is in seconds: 600 s = 10 minutes
datetime <- ymd_hms("2020-01-30 15:34:56")
year(datetime)
## [1] 2020
month(datetime)
## [1] 1
mday(datetime)
## [1] 30
yday(datetime)
## [1] 30
wday(datetime)
## [1] 5
month() and wday() can return more information, such as the full name of the month or weekday:
month(datetime, label = TRUE, abbr = FALSE)
## [1] January
## 12 Levels: January < February < March < April < May < June < ... < December
wday(datetime, label = TRUE, abbr = FALSE)
## [1] Thursday
## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
Visualize number of departures during a week:
# Bar chart of the number of departures on each day of the week.
flights_dt %>%
  # label = TRUE makes wday() return labelled weekdays rather than numbers.
  mutate(wday = wday(dep_time, label = TRUE)) %>%
  ggplot(mapping = aes(x = wday)) +
  geom_bar()
Dates can be rounded to a nearby unit of time with floor_date(), round_date(), and ceiling_date():
# Line chart of the number of departures per week.
flights_dt %>%
  # Round each departure down to its week, then count the rows per week.
  count(week = floor_date(dep_time, unit = "week")) %>%
  ggplot(mapping = aes(x = week, y = n)) +
  geom_line()
Subtracting two dates gives a difftime object:
# How old is Hadley?
h_age <- today() - ymd(19791014)
h_age
## Time difference of 14722 days
lubridate provides the duration object that always uses seconds:
as.duration(h_age)
## [1] "1271980800s (~40.31 years)"
Constructors for duration:
dseconds(5)
## [1] "5s"
dminutes(10)
## [1] "600s (~10 minutes)"
dhours(c(12, 24))
## [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
## [1] "0s" "86400s (~1 days)" "172800s (~2 days)"
## [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
## [1] "1814400s (~3 weeks)"
dyears(1)
## [1] "31536000s (~52.14 weeks)"
Durations represent an exact number of seconds, so adding one day (86400 seconds) across a daylight saving time change shifts the clock time:
one_pm <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")
one_pm
## [1] "2016-03-12 13:00:00 EST"
one_pm + ddays(1)
## [1] "2016-03-13 14:00:00 EDT"
Periods are time spans but don’t have a fixed length in seconds, instead they work with “human” times, like days and months.
one_pm
## [1] "2016-03-12 13:00:00 EST"
one_pm + days(1)
## [1] "2016-03-13 13:00:00 EDT"
Constructors for period:
seconds(15)
## [1] "15S"
minutes(10)
## [1] "10M 0S"
hours(c(12, 24))
## [1] "12H 0M 0S" "24H 0M 0S"
days(7)
## [1] "7d 0H 0M 0S"
months(1:6)
## [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
## [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
weeks(3)
## [1] "21d 0H 0M 0S"
years(1)
## [1] "1y 0m 0d 0H 0M 0S"
Some planes appear to have arrived at their destination before they departed from New York City.
# Show the flights whose recorded arrival is earlier than their departure,
# printing every column.
flights_dt %>%
  filter(dep_time > arr_time) %>%
  print(width = Inf)
## # A tibble: 10,633 x 9
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR BQN 9 -4 2013-01-01 19:29:00 2013-01-01 19:20:00
## 2 JFK DFW 59 NA 2013-01-01 19:39:00 2013-01-01 18:40:00
## 3 EWR TPA -2 9 2013-01-01 20:58:00 2013-01-01 21:00:00
## 4 EWR SJU -6 -12 2013-01-01 21:02:00 2013-01-01 21:08:00
## 5 EWR SFO 11 -14 2013-01-01 21:08:00 2013-01-01 20:57:00
## 6 LGA FLL -10 -2 2013-01-01 21:20:00 2013-01-01 21:30:00
## 7 EWR MCO 41 43 2013-01-01 21:21:00 2013-01-01 20:40:00
## 8 JFK LAX -7 -24 2013-01-01 21:28:00 2013-01-01 21:35:00
## 9 EWR FLL 49 28 2013-01-01 21:34:00 2013-01-01 20:45:00
## 10 EWR FLL -9 -14 2013-01-01 21:36:00 2013-01-01 21:45:00
## arr_time sched_arr_time air_time
## <dttm> <dttm> <dbl>
## 1 2013-01-01 00:03:00 2013-01-01 00:07:00 192
## 2 2013-01-01 00:29:00 2013-01-01 21:51:00 NA
## 3 2013-01-01 00:08:00 2013-01-01 23:59:00 159
## 4 2013-01-01 01:46:00 2013-01-01 01:58:00 199
## 5 2013-01-01 00:25:00 2013-01-01 00:39:00 354
## 6 2013-01-01 00:16:00 2013-01-01 00:18:00 160
## 7 2013-01-01 00:06:00 2013-01-01 23:23:00 143
## 8 2013-01-01 00:26:00 2013-01-01 00:50:00 338
## 9 2013-01-01 00:20:00 2013-01-01 23:52:00 152
## 10 2013-01-01 00:25:00 2013-01-01 00:39:00 154
## # … with 10,623 more rows
These are the overnight flights. Let’s fix this:
# Flag flights whose arrival is earlier than their departure and move their
# actual and scheduled arrival times one day later.
flights_dt <- flights_dt %>%
  mutate(
    # Must be computed first: it compares against the unshifted arr_time.
    overnight = dep_time > arr_time,
    # The logical flag becomes 1 or 0, so flagged rows gain one day and the
    # rest gain none.
    arr_time = arr_time + days(as.numeric(overnight)),
    sched_arr_time = sched_arr_time + days(as.numeric(overnight))
  )
### Intervals