1. Exploratory analysis

Visualize the data with line plots, ACF plots, and PACF plots to get a sense of the dynamics of each series. For each series, you will need to test whether it is stationary or has unit roots. A time series process is considered stationary if it has a constant mean, a constant variance, and constant covariance. First decide what to set \(D_t\) to based on the overall pattern.

Scenario Deterministics Function
Constant mean of 0 \(D_t=0\) ur_tests(type="none") in Step 2
Constant nonzero mean \(D_t=c\) ur_tests(type="constant") in Step 2
Trending mean \(D_t = c + \delta t\) ur_tests(type="trend") in Step 2
Unsure of pattern Try \(D_t = c + \delta t\) then \(D_t=c\) phi_tests in Step 2
Structural break at known time Add intervention to \(D_t\) It’s complicated
Structural break at unknown time See rows 1-3 urca::ur.za


2. Test for unit roots

Use the ur_tests function in tsatools to test for unit roots. This function automatically selects \(p_{max}\) (although you can manually input one too), conducts four types of tests with various lags (ADF, PP, DF-GLS, and KPSS), and shows you the top results. This function calls adf_tests in tsatools (adapted from Linn’s PGTSadfv2.r), which you can also use to view all ADF tests with lags from 0 to \(p_{max}\).

If unsure about \(D_t\), use the phi_tests function in tsatools to try different deterministics and test for unit roots. The function calls adf_tests(type="trend") and tests with \(\tau_\tau\), \(\phi_3\), and \(\phi_2\), and if needed it then calls adf_tests(type="constant")and conducts tests with \(\tau_\mu\) and \(\phi_1\).

ur_tests (y, type, …)

y = a vector or time series object

type = the model deterministics, either “none”, “constant”, or “trend”

ur_tests(df$app, type="constant", format="html")
Unit Root Test Results
Test Lag strategy Lags Statistic p-value Critical value Conclusion
ADF Smallest lags 0 -5.369 0 Stationary
Smallest AIC 0 -5.369 0 Stationary
Smallest BIC 0 -5.369 0 Stationary
First sig. diffs. 14 -4.399 0 Stationary
Phillips-Perron Short 6 -5.421 -2.866 Stationary
Long 19 -5.274 -2.866 Stationary
DF-GLS Manual 19 -1.067 -1.940 Unit roots
KPSS Short 6 0.738 0.463 Unit roots
Long 19 0.347 0.463 Stationary

Observations: 709

#' Unit root tests
#'
#' Conducts multiple types of tests for unit roots and automatically selects the best fitting models to display.
#'
#' @param y a vector or `ts` object
#' @param pmax the maximum lag to test; if NULL, `pmax` is automatically chosen with 12*(t/100)^(.25)
#' @param lmtestorder the order for the Lagrange multiplier tests for serial correlation; if NULL, `pmax` is used
#' @param type the model deterministics
#' @param tests a character vector specifying which tests to conduct: "adf" for augmented Dickey-Fuller tests, "pp" for Phillips-Perron tests, "dfgls" for Dickey-Fuller tests with generalized least squares, and "kpss" for Kwiatkowski-Phillips-Schmidt-Shin tests. Can include anywhere from one to four of these tests; defaults to all four.
#' @param format aargument to be passed to knitr::kable for displaying the results output. Possible values are `"latex"`, `"html"`, `"pipe"`, `"simple"`, `"rst"`, and `"none"`. If `"none"` is selected, no output is displayed in the console.
#' @import tidyverse lmtest urca knitr
#' @export


ur_tests = function(y, pmax=NULL, lmtestorder=NULL, type = c("none", "constant", "trend"),
                    tests = c("adf", "pp", "dfgls", "kpss"), format="simple") {

  # Determine pmax
  t = length(y)
  if (is.null(pmax)) pmax = floor(12*(t/100)^(.25))
  if (is.null(lmtestorder)) lmtestorder = pmax

  # Initialize list to store data
  urs = list()

  # Determine if tests and types are compatible
  if (type=="none" & !("adf" %in% tests)) {
    stop(paste("PP, DF-GLS, and KPSS tests require a constant or trend.",
               "Try setting `type` to '\"constant\"' or '\"trend\"' or setting `tests` to '\"adf\"'."))
  }

  # ADF TESTS -------------------------------------------------------------
  if ("adf" %in% tests) {
    adf = tsatools::adf_tests(y, pmax, lmtestorder, type, format="none")

    # Various selections for p
    p = c(
      `Smallest lags` = which(adf$Lags==min(adf$Lags[adf$`LM p-value`>.05])),
      `Smallest AIC` = which(adf$AIC==min(adf$AIC[adf$`LM p-value`>.05])),
      `Smallest BIC` = which(adf$BIC==min(adf$BIC[adf$`LM p-value`>.05])),
      `First sig. diffs.` = first(which(adf$`Diff p-value`[adf$`LM p-value`>.05] < .10))
    )

    # Collect ADF tests
    urs[[length(urs)+1]] = adf[p,] %>%
      mutate(Test = "ADF",
             `Lag strategy` = names(p),
             Lags = adf$Lags[p],
             Conclusion = ifelse(`DF p-value`<.05, "Stationary", "Unit roots")) %>%
      select(Test, `Lag strategy`, Lags, Statistic=`DF test`, `p-value`=`DF p-value`, Conclusion)
  }


  # PHILLIPS-PERRON TESTS -------------------------------------------------
  if (type!="none" & "pp" %in% tests) {
    model = type
    pp_short = ur.pp(y, type="Z-tau", model=model, lags="short")
    pp_long = ur.pp(y, type="Z-tau", model=model, lags="long")

    # Collect PP tests
    pps = tibble(
      Test = "Phillips-Perron",
      `Lag strategy` = c("Short", "Long"),
      Lags = c(attr(pp_short, "lag"), attr(pp_long, "lag")),
      Statistic = c(attr(pp_short, "teststat"), attr(pp_long, "teststat")),
      `Critical value` = c(attr(pp_short, "cval")[2], attr(pp_long, "cval")[2])
    )
    pps$Conclusion = ifelse(abs(pps$Statistic)>abs(pps$`Critical value`),
                            "Stationary", "Unit roots")
    urs[[length(urs)+1]] = pps
  }


  # DF-GLS TESTS ----------------------------------------------------------
  if (type!="none" & "dfgls" %in% tests) {
    model = type
    dfgls = ur.ers(y, model=model, lag.max=pmax)
    dfglss = tibble(
      Test = "DF-GLS",
      `Lag strategy` = "Manual",
      Lags = attr(dfgls, "lag"),
      Statistic = attr(dfgls, "teststat"),
      `Critical value` = attr(dfgls, "cval")[2]
    )
    dfglss$Conclusion = ifelse(abs(dfglss$Statistic)>abs(dfglss$`Critical value`),
                               "Stationary", "Unit roots")
    urs[[length(urs)+1]] = dfglss
  }


  # KPSS TESTS ------------------------------------------------------------
  if (type!="none" & "kpss" %in% tests) {
    model = ifelse(type=="trend", "tau", "mu")
    kpss_short = ur.kpss(y, type=model, lags="short")
    kpss_long = ur.kpss(y, type=model, lags="long")

    # Collect KPSS tests
    kpsss = tibble(
      Test = "KPSS",
      `Lag strategy` = c("Short", "Long"),
      Lags = c(attr(kpss_short, "lag"), attr(kpss_long, "lag")),
      Statistic = c(attr(kpss_short, "teststat"), attr(kpss_long, "teststat")),
      `Critical value` = c(attr(kpss_short, "cval")[2], attr(kpss_long, "cval")[2])
    )
    kpsss$Conclusion = ifelse(abs(kpsss$Statistic)>abs(kpsss$`Critical value`),
                              "Unit roots", "Stationary")
    urs[[length(urs)+1]] = kpsss
  }


  # COMBINE ALL TESTS -----------------------------------------------------
  if (type=="none") out = urs[[1]] %>% remove_rownames()
  if (type!="none") {
    out = bind_rows(urs) %>%
      select(1,2,3,4,5,7,6) %>%
      remove_rownames()
  }

  # Display output
  if (format!="none") {
    out1 = out %>%
      mutate(Test = ifelse(lag(Test)==Test & !is.na(lag(Test)), "", Test))
    out2 = kable(out1, format, digits=3, caption="Unit Root Test Results")
    out2 = gsub("Table: ", "", out2)
    out2 = gsub("NA", "  ", out2)
    out2[length(out2)+1] = paste("\nObservations:", t)
    print(out2)
  }
  invisible(out)

}

phi_tests (y, lag.strategy, …)

y = a vector or time series object

lag.strategy = the strategy for choosing lags; either “p” (for smallest lags with white noise residuals), “aic”, “bic”, or “diffs” (first significant differences)

phi_tests(df$unemp, lag.strategy="p", format="html")
Unit root tests selected with smallest lags
Lags \(D_t\) Parameter \(H_0\) Statistic Result
0 c + dt tau_tau g0 = 0 -3.37 Unit roots
0 c + dt phi3 g = d = 0 5.82 No deterministic trend
0 c tau_mu g = 0 -3.39* Stationary

Conclusion: Series is stationary

#' ADF tests with uncertain deterministics
#'
#' Conducts augmented Dickey-Fuller (ADF) tests for unit roots.
#'
#' @param y a vector or `ts` object
#' @param pmax the maximum lag to test; if NULL, `pmax` is automatically chosen with 12*(t/100)^(.25)
#' @param lmtestorder the order for the Lagrange multiplier tests for serial correlation; if NULL, `pmax` is used
#' @param lag.strategy the strategy for choosing lags `"p"` (for smallest lags with white noise residuals), `"aic"`, `"bic"`, or `"diffs"` (first significant differences)
#' @param format argument to be passed to knitr::kable for displaying the results output. Possible values are `"latex"`, `"html"`, `"pipe"`, `"simple"`, `"rst"`, and `"none"`. If `"none"` is selected, no output is displayed in the console.
#' @import tidyverse lmtest knitr
#' @export


phi_tests = function(y, pmax=NULL, lmtestorder=NULL, lag.strategy=c("p", "aic", "bic", "diffs"),
                     format="simple") {

  # Function to select ADF tests at each lag
  select.adf = function(adf) {
    p = c(`p` = which(adf$Lags==min(adf$Lags[adf$`LM p-value`>.05])),
          `aic` = which(adf$AIC==min(adf$AIC[adf$`LM p-value`>.05])),
          `bic` = which(adf$BIC==min(adf$BIC[adf$`LM p-value`>.05])),
          `diffs` = first(which(adf$`Diff p-value`[adf$`LM p-value`>.05] < .10)))
    adf[p,] %>% mutate(`Lag strategy` = names(p), .before=1) %>% remove_rownames()
  }

  # ADF tests
  adf.trend = select.adf(tsatools::adf_tests(y, pmax, lmtestorder, type="trend", format="none"))
  adf.drift = select.adf(tsatools::adf_tests(y, pmax, lmtestorder, type="constant", format="none"))

  # Lag strategy
  p = which(lag.strategy==c("p", "aic", "bic", "diffs"))
  p.name = c("smallest lags", "AIC", "BIC", "first significant differences")[p]

  # Initialize objects
  tt = list(reject=FALSE)
  phi3 = list(reject=FALSE)
  phi2 = list(reject=FALSE)
  tu = list(reject=FALSE)
  phi1 = list(reject=FALSE)

  # Test tau_tau
  tt = list(
    lags = adf.trend$Lags[p],
    statistic = adf.trend$`DF test`[p],
    p.value = adf.trend$`DF p-value`[p],
    reject = (adf.trend$`DF p-value`[p] < .05),
    result = ifelse(adf.trend$`DF p-value`[p] < .05, "Stationary", "Unit roots")
  )

  # Test phi3
  if (!tt$reject) {
    phi3 = list(
      lags = adf.trend$Lags[p],
      statistic = adf.trend$Phi3[p],
      cv = adf.trend$`Phi3 CV`[p],
      reject = (adf.trend$Phi3[p] > adf.trend$`Phi3 CV`[p]),
      result = ifelse(adf.trend$Phi3[p] > adf.trend$`Phi3 CV`[p],
                      "Deterministic trend is present", "No deterministic trend")
    )
  }

  # Test phi2
  if (!tt$reject & phi3$reject) {
    phi2 = list(
      lags = adf.trend$Lags[p],
      statistic = adf.trend$Phi2[p],
      cv = adf.trend$`Phi2 CV`[p],
      reject = (adf.trend$Phi2[p] > adf.trend$`Phi2 CV`[p]),
      result = ifelse(adf.trend$Phi2[p] > adf.trend$`Phi2 CV`[p],
                      "Drift is present", "No drift is present")
    )
  }

  # Test tau_mu
  if (!tt$reject & !phi3$reject) {
    tu = list(
      lags = adf.drift$Lags[p],
      statistic = adf.drift$`DF test`[p],
      p.value = adf.drift$`DF p-value`[p],
      reject = (adf.drift$`DF p-value`[p] < .05),
      result = ifelse(adf.drift$`DF p-value`[p] < .05, "Stationary", "Unit roots")
    )
  }

  # Test phi1
  if (!tt$reject & !phi3$reject & !tu$reject) {
    phi1 = list(
      lags = adf.drift$Lags[p],
      statistic = adf.drift$Phi1[p],
      cv = adf.drift$`Phi1 CV`[p],
      reject = (adf.drift$Phi1[p] > adf.drift$`Phi1 CV`[p]),
      result = ifelse(adf.drift$Phi1[p] > adf.drift$`Phi1 CV`[p],
                      "Drift is present", "No drift is present")
    )
  }

  # Final result
  conc = dplyr::case_when(
    tt$reject ~ "stationary",
    !tt$reject & phi3$reject & !phi2$reject ~ "unit root, no drift or trend",
    !tt$reject & phi3$reject & phi2$reject ~ "unit root with drift and trend",
    tu$reject ~ "stationary",
    phi1$reject ~ "unit root with drift",
    TRUE ~ "unit root, no drift or trend"
  )

  # Put results together
  tests = list(`tau_tau`=tt, `phi3`=phi3, `phi2`=phi2, `tau_mu`=tu, `phi1`=phi1)

  # Prepare summary table
  out = tests %>%
    bind_rows(.id="Parameter") %>%
    mutate(Dt = c(rep("c + dt", 3), rep("c", 2)),
           H0 = c("g0 = 0", "g = d = 0", "g = c = d = 0", "g = 0", "g = c = 0"),
           Statistic = paste0(sprintf("%.2f", round(statistic, 2)),
                              ifelse(reject, "*", " "))) %>%
    select(Lags=lags, Dt, Parameter, H0, Statistic, Result=result) %>%
    na.omit()

  # Add summary info to results
  tests = keep(tests, ~length(.x)>1)
  tests$summary = data.frame(out)
  tests$result = conc

  # Display output
  if (format!="none") {
    if (format %in% c("latex", "html")) {
      names(out) = c("Lags", "$D_t$", "Parameter", "$H_0$", "Statistic", "Result")
    }
    out1 = out %>%
      kable(format=format, align="llllcl",
            caption=paste("Unit root tests selected with", p.name))
    print(gsub("Table: ", "", out1))
    cat("Conclusion: Series is", conc)
  }
  invisible(tests)

}


3. Determine the appropriate modeling strategy

The methods from here on require all series to be stationary, and they assume weak exogeneity. For weak exogeneity to hold, the parameters of interest must only be a function of parameters in the conditional model, without any cross-equation restrictions. The dependent variable must only be a function of lagged values, not contemporaneous values, of the independent variables of interest.

Dependent variable Independent variables Weak exogeneity Model strategy
Stationary All stationary Plausible ADL (better for paring down model) in Step 4 or
GECM (better for estimating error correction) in Step 4
Stationary At least one has unit roots Plausible Difference the unit root series, then go back to row 1
Unit root All stationary Plausible Reconsider theory – long-run relationships not likely
Unit root At least one has unit roots Plausible Not covered here – need cointegration methods
(Anything) (Anything) Not plausible Not covered here – need multi-equation models


4. Fit a general model

Use dynlm in the dynlm package to estimate a general model with as many lags \(p\) of the dependent variable and as many lags \(q\) of the independent variables as could plausibly be needed to capture the dynamics. You can use rename_coefs to make the coefficient names more readable.

Model Description
ADL(p,q;n)
\(p=\) lags of \(y\)
\(q=\) lags of each \(x_j\)
\(n=\) number of \(x\) variables
Autoregressive distributive lag (ADL) models regress the dependent variable at time \(t\), \(y_t\), as a function of \(p\) past values of itself \(y_{t-i}\), contemporaneous (current) values of the \(n\) independent variables \(x_{jt}\), and \(q\) past values of the independent variables \(x_{j,t-i}\).
\(\begin{aligned} y_t &= \alpha_0 + \sum_{i=1}^p \alpha_i y_{t-i} + \sum_{j=1}^n \sum_{i=0}^q \beta_{ji} x_{j,t-i} + \varepsilon_t \\ \end{aligned}\)
GECM(p,q;n)
\(p-1=\) lags of \(y\)
\(q-1=\) lags of each \(x_j\)
\(n=\) number of \(x\) variables
Generalized error correction models (GECMs) are mathematically equivalent to ADL models. They regress the change in the dependent variable at each time from the previous time, \(\Delta y_t\), as a function of its last value \(y_{t-1}\), the past \(p-1\) changes in its values \(\Delta y_{t-i}\), the last values of the \(n\) independent variables \(x_{j,t-1}\), contemporaneous changes in the independent variables \(\Delta x_{jt}\), and \(q-1\) past changes in the independent variables \(\Delta x_{j,t-i}\).
\(\begin{aligned} \Delta y_t &= \alpha_0 + \alpha_1^* y_{t-1} + \sum_{i=1}^{p-1} \alpha_{i+1}^* \Delta y_{t-i} + \sum_{j=1}^n \beta_{j0}^* \Delta x_{jt} + \sum_{j=1}^n \sum_{i=1}^{q-1} \gamma_{ji}^* \Delta x_{j,t-i} + \sum_{j=1}^n \beta_{j1}^* x_{j,t-1} + \varepsilon_t \\ \end{aligned}\)

rename_coefs (mod, …)

mod = a lm object from dynlm

key = a named character vector for renaming coefficients

latex = logical: whether to include latex notation

# Time series object
df.ts = ts(df, start=c(1978,1), freq=12)

# ADL(p=2, q=2; n=5)
adl1 = dynlm(app ~ L(app,1:2) + lei + L(lei,1:2) + cei + L(cei,1:2) + 
               dipc + L(dipc,1:2) + infl + L(infl,1:2) + unemp + L(unemp,1:2) + 
               Reagan + BushI + Clinton + BushII + Obama + Trump + Honey + Inaug +  
               RAss + IraqKuwait + Desert + Lehman + Sept11 + Mueller  + Impeach1 + 
               Impeach2 + CovidP + CovidS, df.ts)

# GECM(p=2, q=2; n=5)
gecm1 = dynlm(d(app) ~ L(app) + L(d(app)) + L(d(lei),0:1) + L(lei) + 
                L(d(cei),0:1) + L(cei) + L(d(dipc),0:1) + L(dipc) + 
                L(d(infl),0:1) + L(infl) + L(d(unemp),0:1) + L(unemp) + 
                Reagan + BushI + Clinton + BushII + Obama + Trump + Honey + Inaug +  
                RAss + IraqKuwait + Desert + Lehman + Sept11 + Mueller  + Impeach1 + 
                Impeach2 + CovidP + CovidS, df.ts)

# Key with variable names
covars = c(app="Presidential Approval",
           lei="Leading Economic Index",
           cei="Coincident Economic Index",
           dipc="Disposable Income per capita",
           infl="Inflation rate",
           unemp="Unemployment rate")

# Rename coefficients
adl1b = rename_coefs(adl1)
adl1c = rename_coefs(adl1, key=covars, latex=TRUE)
gecm1b = rename_coefs(gecm1)
gecm1c = rename_coefs(gecm1, key=covars, latex=TRUE)

# Display different coefficients
kable(tibble(
  `Original coefficient names` = names(coef(gecm1)),
  `rename_coefs(mod)` = names(coef(gecm1b)),
  `rename_coefs(mod, key=covars, latex=TRUE)` = names(coef(gecm1c)),
))
Original coefficient names rename_coefs(mod) rename_coefs(mod, key=covars, latex=TRUE)
(Intercept) (Intercept) (Intercept)
L(app) app (t-1) Presidential Approval \((t-1)\)
L(d(app)) d app (t-1) \(\Delta\) Presidential Approval \((t-1)\)
L(d(lei), 0:1)zoo(coredata(x), tt).1 d lei (t) \(\Delta\) Leading Economic Index \((t)\)
L(d(lei), 0:1)zoo(coredata(x), tt).2 d lei (t-1) \(\Delta\) Leading Economic Index \((t-1)\)
L(lei) lei (t-1) Leading Economic Index \((t-1)\)
L(d(cei), 0:1)zoo(coredata(x), tt).1 d cei (t) \(\Delta\) Coincident Economic Index \((t)\)
L(d(cei), 0:1)zoo(coredata(x), tt).2 d cei (t-1) \(\Delta\) Coincident Economic Index \((t-1)\)
L(cei) cei (t-1) Coincident Economic Index \((t-1)\)
L(d(dipc), 0:1)zoo(coredata(x), tt).1 d dipc (t) \(\Delta\) Disposable Income per capita \((t)\)
L(d(dipc), 0:1)zoo(coredata(x), tt).2 d dipc (t-1) \(\Delta\) Disposable Income per capita \((t-1)\)
L(dipc) dipc (t-1) Disposable Income per capita \((t-1)\)
L(d(infl), 0:1)zoo(coredata(x), tt).1 d infl (t) \(\Delta\) Inflation rate \((t)\)
L(d(infl), 0:1)zoo(coredata(x), tt).2 d infl (t-1) \(\Delta\) Inflation rate \((t-1)\)
L(infl) infl (t-1) Inflation rate \((t-1)\)
L(d(unemp), 0:1)zoo(coredata(x), tt).1 d unemp (t) \(\Delta\) Unemployment rate \((t)\)
L(d(unemp), 0:1)zoo(coredata(x), tt).2 d unemp (t-1) \(\Delta\) Unemployment rate \((t-1)\)
L(unemp) unemp (t-1) Unemployment rate \((t-1)\)
Reagan Reagan (t) Reagan \((t)\)
BushI BushI (t) BushI \((t)\)
Clinton Clinton (t) Clinton \((t)\)
BushII BushII (t) BushII \((t)\)
Obama Obama (t) Obama \((t)\)
Trump Trump (t) Trump \((t)\)
Honey Honey (t) Honey \((t)\)
Inaug Inaug (t) Inaug \((t)\)
RAss RAss (t) RAss \((t)\)
IraqKuwait IraqKuwait (t) IraqKuwait \((t)\)
Desert Desert (t) Desert \((t)\)
Lehman Lehman (t) Lehman \((t)\)
Sept11 Sept11 (t) Sept11 \((t)\)
Mueller Mueller (t) Mueller \((t)\)
Impeach1 Impeach1 (t) Impeach1 \((t)\)
Impeach2 Impeach2 (t) Impeach2 \((t)\)
CovidP CovidP (t) CovidP \((t)\)
CovidS CovidS (t) CovidS \((t)\)
#' Rename covariates from ADL and GECM models
#'
#' Makes coefficient names more readable.
#'
#' @param mod a `dynlm` object
#' @param key a named character vector for renaming coefficients, where the name of each element is the name of a variable used in the code for the model and the value of the element is the display name to replace the code name. (For example, `c(infl="Inflation rate")` replaces `infl` with `Inflation rate`.)
#' @param latex, logical: whether to include latex notation ($\\\\Delta$ instead of `d` and $t-1$ instead of `t-1`). Recommended if the output format is html or latex.
#' @import stringr
#' @export


rename_coefs = function(mod, key=NULL, latex=FALSE) {

  # Store variable info in data frame
  x = data.frame(var = names(mod$coefficients)) %>%
    mutate(var = str_remove(var, "zoo\\(coredata\\(x\\), tt\\)\\."),
           lag = str_detect(var, "L\\("),
           diff = str_detect(var, "d\\("),
           name = str_replace_all(var, ".*\\(|,.*|\\).*", "") %>%
             str_replace("Intercept", "\\(Intercept\\)")) %>%

    # Keep track of which lag is which
    mutate(lags = ifelse(grepl(",", var), str_replace_all(var, ".*\\, +|c\\(|\\).*", ""), NA),
           i = as.numeric(ifelse(grepl(",", var), str_extract(var, "[0-9]+$"), NA)),
           i0 = as.numeric(purrr::map(lags, ~first(unlist(str_split(.x, "\\:"))))),
           i1 = as.numeric(purrr::map(lags, ~last(unlist(str_split(.x, "\\:")))))) %>%
    rowwise() %>%
    mutate(h = ifelse(!is.na(lags), c(i0:i1)[i], NA),
           h = ifelse(lag & is.na(h), 1, h)) %>%

    # Paste together information into final variable names
    mutate(final = name,
           final = ifelse(diff, paste("d", final), final),
           final = ifelse(lag, paste0(final, " (t-", h, ")"), final),
           final = ifelse(lag & h==0, str_remove(final, "-0"), final),
           final = ifelse(final!="(Intercept)" & !grepl("\\)", final), paste(final, "(t)"), final))

  # Store variable names as vector
  x1 = x$final

  # Rename variables
  if(!is.null(key)) {
    for (i in 1:length(key)) x1 = str_replace(x1, names(key)[i], key[i])
  }

  # Prepare for latex
  if (latex) {
    x1 = ifelse(grepl("^d ", x1), gsub("^d ", "$\\\\Delta$ ", x1), x1) %>%
      str_replace("\\(t", "$\\(t") %>%
      str_replace("(?<=[0-9])\\)|(?<=\\(t)\\)", "\\)$")
  }

  # Return model
  names(mod$coefficients) = x1
  mod

}


5. Perform diagnostics

Check that the models satisfy the following assumptions:

  1. Linear and constant parameters

  2. Dynamic stability

  3. Zero conditional mean

  4. No serial correlation

  5. No heteroscedasticity

The mod_checks function conducts tests relating to all of these assumptions except #3. For #4, it tests for serial correlation using lmtest::bgtest with orders 6, 12, 18, and 24 by default and reports whether serial correlation is detected at any of these orders. You can also use bg_tests to to see more information on the serial correlation tests at each of these orders.

mod_checks (mod, y, data, orders)

mod = a dynlm object

y = the name of the dependent variable

data = the data frame or matrix used in the model

orders = an integer vector of maximal orders of serial correlation to be tested

mod_checks(list(ADL=adl1, GECM=gecm1), y="app", data=df.ts) %>% kable()
Test ADL GECM
Ramsey’s RESET test 0.586 0.746
CUSUM test 0.601 0.601
Stationary coefficients TRUE TRUE
Serial correlation present FALSE FALSE
B-P test for heteroscedasticity 208.81 ** 208.81 **
#' Diagnostic checks for time series models
#'
#' Conducts tests for model misspecification with `lmtest::resettest`, empirical fluctuation with `strucchange::efp` and `strucchange::sctest`, stationarity with the model coefficients, serial correlation with `lmtest::bgtest`, and heteroscedasticity with `lmtest::bptest`.
#'
#' @param mod a `dynlm` object
#' @param y a character with the name of the dependent variable
#' @param data the data frame or matrix used in the model
#' @param orders an integer vector of maximal orders of serial correlation to be tested
#' @import tidyverse lmtest strucchange
#' @export


mod_checks = function(mods, y, data, orders=c(6,12,18,24)) {

  # Function to check for stationarity
  station = function(coefs, y) {
    a = coefs[grep(y, names(coefs))]
    (sum(a)<1) & (all(abs(a)<1))
  }

  # Diagnostic tests
  ramsey = purrr::map(mods, ~lmtest::resettest(.x, power=2, type="regressor", data=data))
  efp = purrr::map(mods, ~strucchange::sctest(strucchange::efp(.x, type="OLS-CUSUM", data=data)))
  exog = purrr::map(mods, ~station(.x$coefficients, y))
  serial = purrr::map(mods, ~lapply(orders, function(i) lmtest::bgtest(.x, order=i, type="Chisq")))
  hetero = purrr::map(mods, lmtest::bptest)

  # Print results
  rbind(
    `Ramsey's RESET test` = paste(round(unlist(purrr::map(ramsey, "statistic")), 3),
                                  purrr::map(purrr::map(ramsey, "p.value"), tsatools::psig)),
    `CUSUM test` = paste(round(unlist(purrr::map(efp, "statistic")), 3),
                         purrr::map(purrr::map(efp, "p.value"), tsatools::psig)),
    `Stationary coefficients` = unlist(exog),
    `Serial correlation present` = purrr::map(purrr::map(serial, purrr::map, "p.value"), ~any(.x<.05)),
    `B-P test for heteroscedasticity` = paste(
      round(unlist(purrr::map(hetero, "statistic")), 2),
      purrr::map(purrr::map(hetero, "p.value"), tsatools::psig))
  ) %>% as.data.frame() %>% rownames_to_column("Test")

}

bg_tests (mod, orders, …)

mod = a dynlm object

orders = an integer vector of maximal orders of serial correlation to be tested

bg_tests(adl1, orders=c(6,12,18,24), format="html")
Breusch-Godfrey tests for serial correlation
Order LM test statistic p-value Conclusion
6 3.225 0.780 No serial correlation detected
12 7.729 0.806 No serial correlation detected
18 17.205 0.509 No serial correlation detected
24 25.489 0.380 No serial correlation detected
#' Serial correlation tests
#'
#' Conducts Breusch-Godfrey tests for serial correlation with multiple orders at once. Calls lmtest::bgtest.
#'
#' @param mod a fitted 'lm' object or formula to be passed to lmtest::bgtest
#' @param orders an integer vector of maximal orders of serial correlation to be tested
#' @param type the type of test statistic to be used; must be either "Chisq" or "F"
#' @param format argument to be passed to knitr::kable for displaying the results output. Possible values are `"latex"`, `"html"`, `"pipe"`, `"simple"`, `"rst"`, and `"none"`. If `"none"` is selected, no output is displayed in the console.
#' @import lmtest tibble knitr
#' @export


bg_tests = function(mod, orders, type="Chisq", format="simple") {

  # Make sure arguments work
  type = match.arg(type, choices=c("Chisq", "F"))

  # Conduct BG tests
  tests = lapply(orders, function(i) bgtest(mod, order=i, type=type))
  out = tibble(Order=orders,
               `LM test statistic`=unlist(purrr::map(tests, "statistic")),
               `p-value`=unlist(purrr::map(tests, "p.value")))
  out$Conclusion = ifelse(out$`p-value`<.05, "Serial correlation present",
                          "No serial correlation detected")

  # Display output
  if (format!="none") {
    out1 = kable(out, format=format, digits=3,
                 caption="Breusch-Godfrey tests for serial correlation")
    out1 = gsub("Table: ", "", out1)
    print(out1)
  }
  invisible(out)

}


6. Adjust model

If any assumptions are violated, update the model accordingly. If possible, simplify the model by removing unnecessary lags to make it more efficient. Then, return to Step 5 to assess whether the adjusted model now satisfies the assumptions.

Problem Some solutions
Too many variables Remove unnecessary lags
Ramsey’s RESET test fails Consider adding more lags or variables
Empirical fluctuation Consider adding more variables or interventions
Zero conditional mean fails Use multiple-equation models
Serial correlation Reevaluate model specficiations (lags, variables, breaks, trends, etc.)
Heteroscedasticity Reevaluate model specifications or apply heteroscedastic and autocorrelation consistent standard errors (HAC)


7. Interpret model

Concept Estimator ADLs GECMs
Direct effects of \(X_j\) Impulse multipliers \(\beta_{j0}\) \(\beta_{j0}^*\)
Total effects of \(X_j\) Long-run multipliers (LRMs) lrms(type="adl") in tsatools lrms(type="gecm") in tsatools
Dynamics of effects Impulse response functions (IRFs) irfs in tsatools Not yet available with irfs
Half-life of effects Median lag length Use irfs and lrms Use IRFs and lrms
Duration of effects Mean lag length Use irfs and lrms Use IRFs and lrms
Rate at which \(Y\) returns to equilibrium Error correction rate \(\sum \alpha_i - 1\) \(\sum \alpha_i^*\)
Equilibrium of \(Y\) Long-run equilibrium \(\hat{Y}\) given \(\bar{X}\)’s \(\hat{Y}\) given \(\bar{X}\)’s

irfs (mod, x, y, data, h, cum)

mod = a dynlm object

x = a vector of names of the independent variables to be included

y = the name of the dependent variable

data = the data frame or matrix used for the model, for calculating standard deviations

h = the number of lags to calculate IRFs out to

cum = whether to calculate cumulative IRFs

irfs(adl2, x=variables[-1], y="app", data=df.ts, h=12) %>%
  pivot_longer(!t) %>%
  mutate(name = factor(name, levels=variables[-1], labels=names(variables)[-1])) %>%
  ggplot(aes(x=t, y=value, color=name)) +
  geom_hline(yintercept=0, color="darkgray") +
  geom_line() +
  facet_wrap(~name, nrow=1) +
  labs(x="Months after a shock", y="Response in Approval") +
  scale_x_continuous(breaks=seq(0,12,3), minor_breaks=seq(1,12,1)) +
  morse::theme_morse() +
  theme(legend.position="none")

#' Impulse response functions
#'
#' Calculates impulse response functions (IRFs) from autoregressive distributive lag (ADL) models and generalized error correction models (GECMs). Note: this function only works for independent variables that include all lags from 1 to q.
#'
#' @param mod a `dynlm` object
#' @param xvars a character vector of names of the independent variables to be included
#' @param yvar a character with the name of the dependent variable
#' @param data the data frame or matrix used for the model, for calculating standard deviations
#' @param h the number of lags to calculate IRFs out to
#' @param cum logical: whether to calculate cumulative IRFs
#' @import tidyverse
#' @export


irfs = function(mod, x, y, data, h=10, cum=TRUE) {

  # Function to calculate IRFs for one variable
  irf.x = function(x) {

    # Standard deviation of X
    sd.x = sd(data[,x], na.rm=TRUE)

    # Get coefficients and such
    coefs = coef(mod)
    a = coefs[grep(y, names(coefs))]
    b = coefs[grep(x, names(coefs))]
    p = length(a)
    q = length(b)
    yt = rep(0, h+p+1)

    # Calculate IRFs
    for (t in 1:(h+1)) {
      yt[t+p] = ifelse(t <= q, b[t]*sd.x, 0) + sum(a*yt[(t+p-1):t])
    }

    # Remove leading zeros and multiply by standard deviation
    yt = yt[-c(1:p)]

    # Return cumulative or non-cumulative sum
    if (cum) {cumsum(yt)} else {yt}

  }

  # Apply to all independent variables supplied
  names(x) = x
  cbind(t=0:h, purrr::map_dfc(x, irf.x))

}

lrms (mod, x, y, type, …)

mod = a dynlm object

x = a vector of names of the independent variables to be included

y = the name of the dependent variable

type = type the type of model; either “adl” or “gecm”

lrms(adl2, x=variables[-1], y="app", type="adl", vcov="HC1", format="html")
Long-run effects on ‘app’
Variable LRM SE Test statistic p-value
lei 218.22 (352.35) 0.62 0.2680
cei -637.35 (584.10) -1.09 0.1378
dipc 0.47 (2.37) 0.20 0.4216
infl -104.64* (49.22) -2.13 0.0169
unemp -2.41* (0.90) -2.69 0.0037
#' Long-run multipliers
#'
#' Calculates long-run multipliers (LRMs) from autoregressive distributive lag (ADL) models and generalized error correction models (GECMs).
#'
#' @param mod a `dynlm` object
#' @param x a character vector of names of the independent variables to be included
#' @param y a character with the name of the dependent variable
#' @param type the type of model; either `"adl"` or `"gecm"`
#' @param vcov a character string specifying the type of heteroscedasticity-consistent covariance matrix estimation, to be passed to `sandwich::vcovHC` via `msm::deltamethod`
#' @param digits integer; the number of digits to round results to
#' @import msm sandwich tidyverse knitr
#' @export


lrms = function(mod, x, y, type=c("adl", "gecm"), vcov="const", digits=2, format="simple") {

  # LRM for one X in ADL
  lrm.adl = function(x) {

    # Coefficients
    aj = grep(y, names(mod$coef))
    bj = grep(x, names(mod$coef))
    alphas = mod$coef[aj]
    betas = mod$coef[bj]

    # Estimate LRM
    est = sum(betas)/(1-sum(alphas))

    # Estimate SE
    se = msm::deltamethod(
      formula(paste0("~(",paste0("x", bj, collapse="+"), ")/(1-",
                     paste0("x", aj, collapse="+"), ")")),
      mod$coef, sandwich::vcovHC(mod, type=vcov))

    # Significance
    tval = est/se
    pval = 1-pt(abs(tval), df=mod$df.residual)

    # Output
    c(`lrm`=est, `se`=se, `statistic`=tval, `p.value`=pval)

  }

  # LRM for one X in GECM
  lrm.gecm = function(x) {

    # Coefficients
    aj = grep(y, names(mod$coef))
    bj = which(grepl(x, names(mod$coef)) & grepl("L\\(", names(mod$coef)))
    alphas = mod$coef[aj]
    beta1s = mod$coef[bj]

    # Estimate LRM
    est = -sum(beta1s)/sum(alphas)

    # Estimate SE
    se = msm::deltamethod(
      formula(paste0("~(-",paste0("x", bj, collapse="+"), ")/(",
                     paste0("x", aj, collapse="+"), ")")),
      mod$coef, sandwich::vcovHC(mod, type=vcov))

    # Significance
    tval = est/se
    pval = 1-pt(abs(tval), df=mod$df.residual)

    # Output
    c(`lrm`=est, `se`=se, `statistic`=tval, `p.value`=pval)

  }

  # Prepare output object
  if (type=="adl") {out = purrr::map(x, lrm.adl)} else {out = purrr::map(x, lrm.gecm)}
  names(out) = x

  # Display output
  if (format!="none") {
    out1 = bind_rows(out, .id="Variable") %>%
      mutate(across(lrm:statistic, ~sprintf(paste0("%.", digits, "f"), round(.x,digits))),
             LRM = paste0(lrm, ifelse(p.value<.05, "*", " ")),
             SE = paste0("(", se, ")"),
             `p-value` = sprintf("%.4f", round(p.value,4))) %>%
      select(Variable, LRM, SE, `Test statistic`=statistic, `p-value`) %>%
      kable(format=format, align="lcccr",
            caption=paste0("Long-run effects on '", y, "'"))
    out1 = gsub("Table: ", "", out1)
    print(out1)
  }
  invisible(out)

}


Other functions

Here are some other functions in tsatools that have not already been discussed.

psig (p, …)

p = a numeric vector of probabilities

#' Asterisk generator for p-values
#'
#' Returns asterisks based on level of significance
#'
#' @param p a numeric vector of probabilities
#' @param stars an integer indicating the maximum number of stars to show
#' @param dot logical: if true, returns "." for significance at .05 < p < 0.10
#' @param blank logical: if true, returns "" for insignificant values; if false, returns NA for insignificant values
#' @import dplyr
#' @export
#' @examples
#' psig(c(.03, .0015, .6, .0002, .09))
#' psig(c(.03, .0015, .6, .0002, .09), dot=FALSE)
#' psig(c(.03, .0015, .6, .0002, .09), blank=FALSE)


psig = function(p, stars=2, dot=TRUE, blank=TRUE) {
  dplyr::case_when(
    p < 0.001 & stars>2 ~ "***",
    p < 0.010 & stars>1 ~ "**",
    p < 0.050 ~ "*",
    p < 0.100 & dot ~ ".",
    TRUE ~ ifelse(blank, "", NA_character_)
  )
}
psig(c(.03, .0015, .6, .0002, .09))
## [1] "*"  "**" ""   "**" "."

phi_cv (phi, n, …)

phi = which \(\phi\) is being tested; must be 1, 2, or 3

n = sample size (number of time points)

#' Critical values for phi tests
#'
#' Helps interpret phi tests from ADF procedures for unit roots.
#'
#' @param phi an integer specifying which phi is being tested; must be 1 for \eqn{\phi_1}, 2 for \eqn{\phi_2}, or 3 for for \eqn{\phi_3}
#' @param tstat the test statistic for the phi test
#' @param n sample size (number of time points)
#' @param alpha confidence level; must be .01, .05, or .1
#' @import dplyr
#' @export
#' @examples
#' phi_cv(phi=2, n=150, alpha=.05)


phi_cv = function(phi=c(1,2,3), n, alpha=c(.01, .05, .1)) {

  # Store critical values
  cvals = list(
    phi1 = rbind(c(7.88, 5.18, 4.12), c(7.06, 4.86, 3.94), c(6.7, 4.71, 3.86),
                 c(6.52, 4.63, 3.81), c(6.47, 4.61, 3.79), c(6.43, 4.59,3.78)),
    phi2 = rbind(c(8.21, 5.68, 4.67), c(7.02, 5.13, 4.31), c(6.5, 4.88, 4.16),
                 c(6.22, 4.75, 4.07), c(6.15, 4.71, 4.05), c(6.09, 4.68, 4.03)),
    phi3 = rbind(c(10.61, 7.24, 5.91), c(9.31, 6.73, 5.61), c(8.73, 6.49, 5.47),
                 c(8.43, 6.49, 5.47), c(8.34, 6.3, 5.36), c(8.27, 6.25, 5.34))
  )

  # Get row number based on sample size
  i = dplyr::case_when(n<25 ~ 1, n<50 ~ 2, n<100 ~ 3, n<250 ~ 4, n<500 ~ 5, TRUE ~ 6)

  # Get column number based on alpha level
  j = dplyr::case_when(alpha==.01 ~ 1, alpha==.05 ~ 2, alpha==.1 ~ 3)

  # Return critical value
  cvals[[phi]][i,j]

}
phi_cv(phi=2, n=150, alpha=.05)
## [1] 4.75

phi_sig (tstat, phi, n)

tstat = the test statistic for the phi test

phi = which \(\phi\) is being tested; must be 1, 2, or 3

n = sample size (number of time points)

#' Significance levels for phi tests
#'
#' Helps interpret phi tests from ADF procedures for unit roots.
#'
#' @param tstat the test statistic for the phi test
#' @param phi an integer specifying which phi is being tested; must be 1 for \eqn{\phi_1}, 2 for \eqn{\phi_2}, or 3 for for \eqn{\phi_3}
#' @param n sample size (number of time points)
#' @import dplyr
#' @export
#' @examples
#' phi_sig(tstat=3.654, phi=2, n=150)


phi_sig <- function(tstat, phi=c(1,2,3), n) {
  dplyr::case_when(
    tstat < tsatools::phi_cv(phi=phi, n=n, alpha=.1) ~ "",
    tstat < tsatools::phi_cv(phi=phi, n=n, alpha=.05) ~ " .",
    tstat < tsatools::phi_cv(phi=phi, n=n, alpha=.01) ~ " *",
    TRUE ~ " **"
  )
}
phi_sig(tstat=3.654, phi=2, n=150)
## [1] ""

vecm_se (cajo, rls)

cajo = a ca.jo object

rls = a cajorls object

#' Standard errors for VECMs
#'
#' Calculates standard errors for coefficients from vector error correction models (VECMs) fitted with `urca::ca.jo`
#'
#' @param cajo a `ca.jo` object
#' @param rls a `cajorls` object
#' @import urca
#' @export


vecm_se = function(cajo, rls) {

  # Get coefficients and info
  alpha <- coef(rls$rlm)[1,]
  beta <- rls$beta
  resids <- resid(rls$rlm)
  N <- nrow(resids)

  # Calculate SEs for alphas
  sigma <- crossprod(resids)/N
  alpha.se <- sqrt(solve(crossprod(cbind(cajo@ZK %*% beta,
                                         cajo@Z1)))[1, 1] * diag(sigma))
  alpha.t <- alpha/alpha.se
  alphas = data.frame(alpha, alpha.se, alpha.t)

  # Calculate SEs for betas
  beta.se <- c(NA, sqrt(diag(kronecker(solve(crossprod(cajo@RK[,-1])),
                                       solve(t(alpha) %*% solve(sigma) %*% alpha)))))
  beta.t <- suppressWarnings(beta[-1]/beta.se)
  betas = data.frame(beta, beta.se, beta.t)

  # Output
  list(`alphas`=alphas, `betas`=betas)

}
df2 = df %>% select(app:unemp) %>% ts()
johansen = urca::ca.jo(df2, type="trace", ecdet="const", spec="transitory")
vecm = cajorls(johansen, r=1)
vecm_se(johansen, vecm)
## $alphas
##                 alpha     alpha.se   alpha.t
## app.d   -1.728792e-03 9.501757e-04 -1.819445
## lei.d    7.560073e-06 1.371622e-06  5.511774
## cei.d    1.278795e-05 1.168288e-06 10.945891
## dipc.d  -1.205518e-03 3.932669e-04 -3.065395
## infl.d  -8.199893e-06 7.126585e-06 -1.150606
## unemp.d  3.333563e-04 8.916110e-05  3.738809
## 
## $betas
##                  ect1     beta.se       beta.t
## app.l1        1.00000          NA           NA
## lei.l1    10151.63298 1391.410142 -50.89566207
## cei.l1   -70816.74039 2803.259126   0.03334987
## dipc.l1      93.48834    8.634610  91.27513004
## infl.l1     788.12515  180.738865  -0.08358041
## unemp.l1    -15.10623    3.868839  25.93896768
## constant    100.35369   25.377966 400.01759076