Data Management and Intro to fpp3 Package

Notes and Hands-on

Authors

Kamarul Imran Musa

Jason Ng

Published

November 7, 2024

Data Management

we will use tidy principle
examples including dataset and codes come from The Epidemiologist R Handbook

Analysis and Modelling Workflow

This my proposed workflow
understand objective at each step
identify at each step
- package
- functions
- parameters
- arguments

Read data into R

package

library(readxl)

function

linelist <- read_xlsx("linelist_cleaned.xlsx")

Explore data

Variables and observations

package

library(tidyverse)

function glimpse() to explore data
- 5888 observations
- 30 variables
pay attention to date variables dttm

glimpse(linelist)

Rows: 5,888
Columns: 30
$ case_id              <chr> "5fe599", "8689b7", "11f8ea", "b8812a", "893f25",…
$ generation           <dbl> 4, 4, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 6, 5, 6, 9, 1…
$ date_infection       <dttm> 2014-05-08, NA, NA, 2014-05-04, 2014-05-18, 2014…
$ date_onset           <dttm> 2014-05-13, 2014-05-13, 2014-05-16, 2014-05-18, …
$ date_hospitalisation <dttm> 2014-05-15, 2014-05-14, 2014-05-18, 2014-05-20, …
$ date_outcome         <dttm> NA, 2014-05-18, 2014-05-30, NA, 2014-05-29, 2014…
$ outcome              <chr> NA, "Recover", "Recover", NA, "Recover", "Recover…
$ gender               <chr> "m", "f", "m", "f", "m", "f", "f", "f", "m", "f",…
$ age                  <dbl> 2, 3, 56, 18, 3, 16, 16, 0, 61, 27, 12, 42, 19, 7…
$ age_unit             <chr> "years", "years", "years", "years", "years", "yea…
$ age_years            <dbl> 2, 3, 56, 18, 3, 16, 16, 0, 61, 27, 12, 42, 19, 7…
$ age_cat              <chr> "0-4", "0-4", "50-69", "15-19", "0-4", "15-19", "…
$ age_cat5             <chr> "0-4", "0-4", "55-59", "15-19", "0-4", "15-19", "…
$ hospital             <chr> "Other", "Missing", "St. Mark's Maternity Hospita…
$ lon                  <dbl> -13.21574, -13.21523, -13.21291, -13.23637, -13.2…
$ lat                  <dbl> 8.468973, 8.451719, 8.464817, 8.475476, 8.460824,…
$ infector             <chr> "f547d6", NA, NA, "f90f5f", "11f8ea", "aec8ec", "…
$ source               <chr> "other", NA, NA, "other", "other", "other", "othe…
$ wt_kg                <dbl> 27, 25, 91, 41, 36, 56, 47, 0, 86, 69, 67, 84, 68…
$ ht_cm                <dbl> 48, 59, 238, 135, 71, 116, 87, 11, 226, 174, 112,…
$ ct_blood             <dbl> 22, 22, 21, 23, 23, 21, 21, 22, 22, 22, 22, 22, 2…
$ fever                <chr> "no", NA, NA, "no", "no", "no", NA, "no", "no", "…
$ chills               <chr> "no", NA, NA, "no", "no", "no", NA, "no", "no", "…
$ cough                <chr> "yes", NA, NA, "no", "yes", "yes", NA, "yes", "ye…
$ aches                <chr> "no", NA, NA, "no", "no", "no", NA, "no", "no", "…
$ vomit                <chr> "yes", NA, NA, "no", "yes", "yes", NA, "yes", "ye…
$ temp                 <dbl> 36.8, 36.9, 36.9, 36.8, 36.9, 37.6, 37.3, 37.0, 3…
$ time_admission       <chr> NA, "09:36", "16:48", "11:22", "12:60", "14:13", …
$ bmi                  <dbl> 117.18750, 71.81844, 16.06525, 22.49657, 71.41440…
$ days_onset_hosp      <dbl> 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 1, 0, 2, 0, 1…

View(linelist)

Descriptive analysis

load package

library(gtsummary)

convert chr to factor

linelist <- linelist |>
  mutate(across(where(is.character ), as_factor))

how many males and females

linelist |> count(gender)

# A tibble: 3 × 2
  gender     n
  <fct>  <int>
1 m       2803
2 f       2807
3 <NA>     278

describe variables based on gender

linelist |>
  select(age, age_cat, hospital, wt_kg, ht_cm, fever, gender) |>
  tbl_summary(by = gender)

Characteristic	m N = 2,803 ¹	f N = 2,807 ¹
age	17 (8, 28)	11 (5, 19)
age_cat
0-4	416 (15%)	640 (23%)
50-69	91 (3.2%)	2 (<0.1%)
15-19	364 (13%)	359 (13%)
20-29	575 (21%)	468 (17%)
10-14	383 (14%)	518 (18%)
30-49	557 (20%)	179 (6.4%)
5-9	412 (15%)	641 (23%)
70+	5 (0.2%)	0 (0%)
hospital
Other	423 (15%)	418 (15%)
Missing	696 (25%)	700 (25%)
St. Mark's Maternity Hospital (SMMH)	196 (7.0%)	209 (7.4%)
Port Hospital	823 (29%)	857 (31%)
Military Hospital	435 (16%)	421 (15%)
Central Hospital	230 (8.2%)	202 (7.2%)
wt_kg	63 (51, 71)	47 (35, 57)
ht_cm	147 (115, 173)	112 (75, 139)
fever	2,172 (81%)	2,161 (81%)
Unknown	113	126
¹ Median (Q1, Q3); n (%)

count number of cases by day

# make dataset of daily counts
daily_counts <- linelist |>
  count(date_hospitalisation, name = "new_cases")
View(daily_counts)

make plots

daily_counts |>
  ggplot(aes(x = date_hospitalisation, y = new_cases )) +
  geom_line(size = 0.5)

we may want to smooth it using rolling average
for this we neeed
- slider
- tidyquant

library(slider)
library(tidyquant)

let’s get 7-day case incidence

rolling <- daily_counts |>
  mutate(                                # create new columns
    # Using slide_dbl()
    ###################
    reg_7day = slide_dbl(
      new_cases,                         # calculate on new_cases
      .f = ~sum(.x, na.rm = T),          # function is sum() with missing values removed
      .before = 6),                      # window is the ROW and 6 prior ROWS
    
    # Using slide_index_dbl()
    #########################
    indexed_7day = slide_index_dbl(
        new_cases,                       # calculate on new_cases
        .i = date_hospitalisation,       # indexed with date_onset 
        .f = ~sum(.x, na.rm = TRUE),     # function is sum() with missing values removed
        .before = days(6))               # window is the DAY and 6 prior DAYS
    )
View(rolling)

plot the 7-day incidence

rolling |> 
  ggplot()+
  geom_line(mapping = aes(x = date_hospitalisation, y = indexed_7day), size = 1)

Quick tutorial for fpp3 package

For more info read
- the fpp3 documentation
- the online Forecasting Principle and Practice book
load library fpp3

library(fpp3)

read a built in data named aus_retail

aus_retail |>
  slice_head(n=4) |>
  View()

There are 5 variables.

Time variable is Month
Outcome variable is Turnover
Make a time series plot for Series IDA3349640L

aus_retail |>
  filter(`Series ID`=="A3349640L") |>
  autoplot(Turnover)

Make a forecast
- for Series ID A3349640L
- using ETS model
- for 2 years ahead

aus_retail |>
  filter(`Series ID`=="A3349640L") |>
  model(ETS(Turnover)) |>
  forecast(h = "2 years")

# A fable: 24 x 6 [1M]
# Key:     State, Industry, .model [1]
   State    Industry                                 .model           Month
   <chr>    <chr>                                    <chr>            <mth>
 1 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Jan
 2 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Feb
 3 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Mar
 4 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Apr
 5 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 May
 6 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Jun
 7 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Jul
 8 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Aug
 9 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Sep
10 Victoria Cafes, restaurants and catering services ETS(Turnover) 2019 Oct
# ℹ 14 more rows
# ℹ 2 more variables: Turnover <dist>, .mean <dbl>

Summary

We have learned and know how to

use tidyverse and other packages for data management
read data, explore and make plots including the rolling average
fpp3 package for time series and forecasting