# Author: Jarad Niemi
# Date: 2024-01-24
# Purpose: Demonstrate data analysis pipeline using the warpbreaks data set as
# an example. It is also intended to show good coding practices.
#-------------------------------------------------------------------------------
library("tidyverse")
# Import
# `warpbreaks` already exists as a data set in R. If you want more details
# about the data set, run
# ?warpbreaks
#-------------------------------------------------------------------------------
# Transform data for better plotting
#-------------------------------------------------------------------------------
<- warpbreaks |>
d mutate(
tension = fct_recode(tension,
Low = "L",
Medium = "M",
High = "H"
),
# These are completely made up
wool = fct_recode(wool,
Lambswool = "A",
`Merino Wool` = "B" # not a valid name, so we need backticks ` `
)
|>
) rename(
Tension = tension,
Wool = wool,
Breaks = breaks
)
# Exploratory statistics
#-------------------------------------------------------------------------------
# Calculate confidence intervals for use in the graphic below
<- d |>
summaries group_by(Wool, Tension) |>
summarize(
n = n(),
mean = mean(Breaks),
sd = sd(Breaks),
.groups = "drop"
|>
) mutate(
lcl = mean - qt(.975, df = n-1) * sd / sqrt(n),
ucl = mean + qt(.975, df = n-1) * sd / sqrt(n),
)
# For consistency across plots, use these values
<- 0.1
jitterwidth <- 0.1
dodgewidth
# Graphical statistics
#-------------------------------------------------------------------------------
ggplot(summaries,
# These values get used by geom_point, geom_pointrange, and geom_line below
aes(x = Tension,
y = mean,
color = Wool)) +
# Add raw data points
geom_point(
data = d,
aes(y = Breaks,
shape = Wool),
position =
# To avoid overlapping points
position_jitterdodge(
jitter.width = jitterwidth, # adds random noise
dodge.width = dodgewidth)) + # puts wool type slight left and right of tension values
# Include means and confidence intervals
geom_pointrange(
aes(
shape = Wool,
ymax = ucl,
ymin = lcl),
position = position_dodge(width = dodgewidth)
+
)
# Add lines to connect the means
geom_line(
aes(
linetype = Wool,
group = Wool),
position = position_dodge(width = dodgewidth)
+
)
labs(
y = "Number of Breaks per Loom",
title = "Raw data with means: Lambswool v Merino Wool"
+
)
# Remove gray background with white grid and replace with the opposite
theme_bw()
# Modeling
#-------------------------------------------------------------------------------
# Multiple regression model with number of breaks as the response and
# explanatory variables: wool, tension, wool-tension interaction.
<- lm(Breaks ~ Wool + Tension + Wool:Tension,
m data = d)
# Test if interaction is significant
anova(m)
# Leave the interaction out, even though it was significant in this case
<- lm(Breaks ~ Wool + Tension, # No interaction
m data = d)
# Use predictive means for plotting purposes
<- d |>
nd select(Wool, Tension) |>
unique() # only keep unique combinations of wool and tension
<- predict(m, newdata = nd, interval = "confidence")
p
# Predicted values and confidence intervals are in the same order as the nd
<- bind_cols(nd, p)
predictions
# Visualize predictions from this multiple regression model with no interaction
# between wool and tension
ggplot(predictions,
aes(x = Tension,
y = fit,
color = Wool)) +
# Add raw data
geom_point(
data = d,
aes(y = Breaks,
shape = Wool),
position =
# To avoid overlapping points
position_jitterdodge(
jitter.width = jitterwidth, # adds random noise
dodge.width = dodgewidth)) +
# Add predicted points
geom_pointrange(
aes(
y = fit,
shape = Wool,
ymax = upr,
ymin = lwr),
position = position_dodge(width = dodgewidth)
+
)
# Add lines to connect the means
geom_line(
aes(
linetype = Wool,
group = Wool),
position = position_dodge(width = dodgewidth)
+
)
labs(
y = "Number of Breaks per Loom",
title = "No-interaction Model: Lambswool v Merino Wool"
+
)
# Remove gray background with white grid and replace with the opposite
theme_bw()
9 Analysis
Here we have a script that performs much of the data science pipeline. Copy-and-paste this following code into a new script and then run the code. While running the code, feel free to run additional commands in the Console to understand what objects are being created.