9  Analysis

Author

Jarad Niemi

Here we have a script that performs much of the data science pipeline. Copy-and-paste this following code into a new script and then run the code. While running the code, feel free to run additional commands in the Console to understand what objects are being created.

# Author: Jarad Niemi
# Date:   2024-01-24
# Purpose: Demonstrate data analysis pipeline using the warpbreaks data set as
#          an example. It is also intended to show good coding practices. 
#-------------------------------------------------------------------------------
library("tidyverse")

# Import
#   `warpbreaks` already exists as a data set in R. If you want more details
#   about the data set, run 
# ?warpbreaks
#-------------------------------------------------------------------------------

# Transform data for better plotting
#-------------------------------------------------------------------------------
d <- warpbreaks |>
  mutate(
    tension = fct_recode(tension,
      Low    = "L",
      Medium = "M",
      High   = "H"
    ),
    
    # These are completely made up
    wool = fct_recode(wool,
      Lambswool     = "A",
      `Merino Wool` = "B"    # not a valid name, so we need backticks ` `
    )
    
  ) |>
  rename(
    Tension = tension,
    Wool    = wool,
    Breaks  = breaks
  )


# Exploratory statistics 
#-------------------------------------------------------------------------------
# Calculate confidence intervals for use in the graphic below
summaries <- d |> 
  group_by(Wool, Tension) |>
  summarize(
    n    = n(),
    mean = mean(Breaks),
    sd   = sd(Breaks),
    
    .groups = "drop"
  ) |>
  mutate(
    lcl = mean - qt(.975, df = n-1) * sd / sqrt(n),
    ucl = mean + qt(.975, df = n-1) * sd / sqrt(n),
  )

# For consistency across plots, use these values
jitterwidth <- 0.1
dodgewidth  <- 0.1


# Graphical statistics
#-------------------------------------------------------------------------------
ggplot(summaries, 
       
       # These values get used by geom_point, geom_pointrange, and geom_line below
       aes(x = Tension, 
           y = mean,
           color = Wool)) +
  
  # Add raw data points
  geom_point(
    data = d,
       aes(y = Breaks,
           shape = Wool),
    position = 
               # To avoid overlapping points
               position_jitterdodge(
                 jitter.width = jitterwidth, # adds random noise
                 dodge.width  = dodgewidth)) + # puts wool type slight left and right of tension values
  
  # Include means and confidence intervals
  geom_pointrange(
    aes(
      shape = Wool,
      ymax  = ucl,
      ymin  = lcl),
    position = position_dodge(width = dodgewidth)
  ) +
  
  # Add lines to connect the means
  geom_line(
    aes(
      linetype = Wool,
      group    = Wool),
    position = position_dodge(width = dodgewidth)
  ) +
  
  labs(
    y = "Number of Breaks per Loom",
    title = "Raw data with means: Lambswool v Merino Wool"
  ) +
  
  # Remove gray background with white grid and replace with the opposite
  theme_bw()


# Modeling
#-------------------------------------------------------------------------------
# Multiple regression model with number of breaks as the response and 
#   explanatory variables: wool, tension, wool-tension interaction.
m <- lm(Breaks ~ Wool + Tension + Wool:Tension,
        data = d)

# Test if interaction is significant
anova(m)

# Leave the interaction out, even though it was significant in this case
m <- lm(Breaks ~ Wool + Tension, # No interaction
        data = d)

# Use predictive means for plotting purposes
nd <- d |>
  select(Wool, Tension) |>
  unique()                 # only keep unique combinations of wool and tension

p <- predict(m, newdata = nd, interval = "confidence")

# Predicted values and confidence intervals are in the same order as the nd
predictions <- bind_cols(nd, p)

# Visualize predictions from this multiple regression model with no interaction
#   between wool and tension
ggplot(predictions, 
       aes(x     = Tension,
           y     = fit,
           color = Wool)) + 
  
  # Add raw data
  geom_point(
    data = d,
    aes(y = Breaks,
        shape = Wool),
    position = 
               # To avoid overlapping points
               position_jitterdodge(
                 jitter.width = jitterwidth, # adds random noise
                 dodge.width  = dodgewidth)) +
  
  # Add predicted points
  geom_pointrange(
    aes(
      y     = fit,
      shape = Wool,
      ymax  = upr,
      ymin  = lwr),
    position = position_dodge(width = dodgewidth)
  ) +

  # Add lines to connect the means
  geom_line(
    aes(
      linetype = Wool,
      group    = Wool),
    position = position_dodge(width = dodgewidth)
  ) +
  
  labs(
    y     = "Number of Breaks per Loom",
    title = "No-interaction Model: Lambswool v Merino Wool"
  ) +
  
  # Remove gray background with white grid and replace with the opposite
  theme_bw()