In this document we perform some descriptive analysis of the priors chosen by participants and compare them across different conditions.
First, we load the data downlaoded from qualtrics (this file is not shared as it contains identifiable information) and remove all the identifiers. We then save this deidentified file.
data.raw = read_csv( file = "data/prior-setting-study.csv",
col_types = list())
data.raw %>%
rename(
t3 = `Duration (in seconds)`,
`Confidence in answer` = Q1_1,
`Previous stats analysis` = Q2,
`Stats software` = Q3,
`Knowledge software` = Q4,
`Knowledge stats` = Q5,
`Knowledge Bayesian` = Q6,
Strategy = Q7,
`Affect of choice` = Q8,
Education = Q9,
condition = cond
) %>%
select(t3, ResponseId, `Confidence in answer`:Education, alpha:t2) %>%
extract( 3:nrow(.), ) %>%
mutate(
t1 = as.numeric(t1),
t2 = as.numeric(t2),
t3 = as.numeric(t3),
time = t1 + t2 + t3
) %>%
select (t1, t2, t3, time, everything()) %>%
write.csv( file = "data/prior-setting-study-deidentified.csv", row.names = FALSE )
We load the cleaned data, after removing all the identifiable information. We then add the codes from the open code generation process. The codes that we generated classified the priors elicited from participants in the survey (for both alpha and beta) as uninformative, weakly informative or informative.
data = read_csv( file = "data/prior-setting-study-deidentified.csv",
col_types = list())
data %<>% mutate(
WIP_alpha = c(NA, NA, TRUE, TRUE, FALSE, TRUE, FALSE, NA, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, NA, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, NA, NA, FALSE, TRUE, FALSE, NA, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, NA, NA, NA, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, NA, NA, TRUE, NA),
WIP_beta = c(NA, NA, FALSE, FALSE, TRUE, NA, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, NA, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, NA, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, NA, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, NA, NA, NA, FALSE, TRUE, TRUE),
IP_alpha = c(NA, NA, FALSE, FALSE, TRUE, FALSE, TRUE, NA, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, NA, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, NA, NA, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, NA, FALSE, FALSE, TRUE, FALSE, NA, FALSE, NA, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, TRUE, NA, FALSE, FALSE, NA, FALSE, FALSE, NA),
IP_beta = c(NA, NA, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, NA, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, NA, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, NA, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, NA, NA, NA, FALSE, FALSE, FALSE)
) %>%
drop_na( condition )
data %>%
write.csv( file = "data/prior-setting-study-deidentified-2.csv", row.names = FALSE )
Check the number of responses in each condition.
data %>%
group_by(condition) %>%
summarise(n = n())
## # A tibble: 3 x 2
## condition n
## <chr> <int>
## 1 0c 17
## 2 1t 18
## 3 2t 15
Calculate the amount of time spent by the participants in the survey. This is the combined time spent across the three pages of the survey (page 1: intro and description, prior elicitation, open-ended questions and statistical experience)
data %>%
mutate( time = (as.numeric(t1) + as.numeric(t2) + (as.numeric(t3)) ) ) %>%
summarise(
mean = mean(time),
sd = sd(time)
)
## # A tibble: 1 x 2
## mean sd
## <dbl> <dbl>
## 1 870. 537.
We ask three likert type questions to elicit participants statistical experience: - Experience with statistics in general - Experience with Bayesian statistics - Experience with a statistical programming language (such as R, SAS etc.)
We visualize the responses to these likert items:
data %>%
gather( domain, knowledge_level, c(`Knowledge stats`, `Knowledge Bayesian`, `Knowledge software`)) %>%
mutate(
`knowledge_level` = factor(`knowledge_level`, levels = c("Not knowledgeable at all", "Slightly knowledgeable", "Moderately knowledgeable", "Very knowledgeable", "Extremely knowledgeable") )
) %>%
ggplot() +
geom_bar(aes( knowledge_level )) +
scale_x_discrete( drop=FALSE, labels = function(x) sub(" "," \n", x, fixed=TRUE) ) +
facet_grid(domain ~ .)
We first create a prior data frame which we will use to create visualizations of the prior density on the parameter scale, response scale and prior predictive density. We compute the density of the prior distributions (using the functions: get_density
and get_density_response
) that we have elicited from the participants, and store them in this data frame so that they can be visualized.
priors <- data %>%
select( ResponseId, condition, alpha, beta, WIP_alpha:IP_beta ) %>%
gather( parameter, prior, alpha, beta ) %>%
separate( col = prior, into = c("dist", "mu", "sigma"), sep = "([\\(,\\)])", extra = "drop" ) %>%
mutate(
dist = replace(dist, dist == "N", "normal"),
dist = replace(dist, dist == "t", "student_t")
) %>%
mutate( mu = as.numeric(mu), sigma = as.numeric(sigma) ) %>%
drop_na( condition ) %>%
mutate( index = seq(1:nrow(.)))
get_density <- function( dist, mu, sigma, grid) {
if (dist == "normal") {
dnorm( grid, mu, sigma )
} else if(dist == "student_t") {
gamlss.dist::dTF( grid, mu, sigma, nu = 3)
} else {
NA
}
}
get_density_response <- function( dist, mu, sigma, grid) {
if (dist == "normal") {
dnorm( log(grid), mu, sigma ) * abs( 1/grid)
} else if(dist == "student_t") {
gamlss.dist::dTF(log(grid), mu, sigma, nu = 3) * abs( 1/grid)
} else {
NA
}
}
Next, we visualize: - the probability density, and - the distribution of the location and scale parameters (using histograms) of the priors elicited from the participants for the three different conditions. The visualization interfaces used in our study were: parameter scale density visualization, responsescale density visualization, prior predictive density visualization.
priors.dens.alpha <- priors %>%
filter( parameter == "alpha" ) %>%
mutate( grid = list(seq(-1, 7, by = 0.01)) ) %>%
mutate( density = pmap(list(dist, mu, sigma, grid), ~ get_density(..1, ..2, ..3, ..4)) )
priors.dens.alpha %>%
unnest( cols = c(grid, density) ) %>%
ggplot() +
geom_vline( xintercept = log(35), alpha = 0.5 ) +
geom_line(aes( x = grid, y = density, color = dist, group = ResponseId), alpha = 0.7) +
facet_wrap( ~ condition)
priors.dens_response.alpha <- priors %>%
filter( parameter == "alpha" ) %>%
mutate( grid = list(seq(0.5, 140, by = 0.5)) ) %>%
mutate( density = pmap(list(dist, mu, sigma, grid), ~ get_density_response(..1, ..2, ..3, ..4)) ) %>%
unnest( cols = c(grid, density) )
df.interval = tibble( x = c(24.6, 44.1), y = range(priors.dens_response.alpha$density)) %>%
gather("axis", "value", x, y) %>%
mutate( axis = c("x.lower", "x.upper", "y.lower", "y.upper")) %>%
spread( axis, value)
p1 <- priors.dens_response.alpha %>%
unnest( cols = c(grid, density) ) %>%
ggplot() +
geom_vline( xintercept = 128 ) +
# geom_rect( data = df.interval, aes(xmin = x.lower, xmax = x.upper, ymin = y.lower, ymax = y.upper), fill = "grey50", alpha = 0.3 ) +
geom_line(aes( x = grid, y = density, color = dist, group = ResponseId), alpha = 0.7) +
scale_x_continuous( breaks = seq(0, 128, by = 16) ) +
facet_grid( . ~ condition)
priors %>%
filter( parameter == "alpha" ) %>%
ggplot() +
geom_vline( xintercept = log(35), alpha = 0.5 ) +
geom_histogram(aes( x = mu), color = "#ffffff", binwidth = 0.1) +
scale_y_continuous( breaks = seq(0, 8, by = 2) ) +
facet_wrap( ~ condition)
priors %>%
filter( parameter == "alpha" ) %>%
ggplot() +
geom_histogram(aes( x = sigma), color = "#ffffff", binwidth = 0.1) +
facet_wrap( ~ condition)
priors.dens.beta <- priors %>%
filter( parameter == "beta" ) %>%
mutate( grid = list(seq(-2, 2, by = 0.01)) ) %>%
mutate( density = pmap(list(dist, mu, sigma, grid), ~ get_density(..1, ..2, ..3, ..4)) )
priors.dens.beta %>%
unnest( cols = c(grid, density)) %>%
ggplot() +
geom_vline( xintercept = 0, alpha = 0.5 ) +
geom_line(aes( x = grid, y = density, color = dist, group = ResponseId), alpha = 0.7, size = 1.25) +
facet_wrap( ~ condition)
priors.dens.beta <- priors %>%
filter( parameter == "beta") %>%
mutate( grid = list(seq(-2.5, 2.5, by = 0.01)) ) %>%
mutate( density = pmap(list(dist, mu, sigma, grid), ~ get_density(..1, ..2, ..3, ..4)) )
priors.dens.beta %>%
unnest( cols = c(grid, density) ) %>%
ggplot() +
geom_vline( xintercept = 0, alpha = 0.5 ) +
geom_line(aes( x = grid, y = density, color = dist, group = ResponseId), alpha = 0.3, size = 1.25) +
scale_x_continuous( breaks = log(c(0.1, 0.2, 0.4, 0.7, 1, 1.4, 2, 3, 5, 8)), labels = function(x) round(exp(x), 2) ) +
facet_wrap( ~ condition)
priors %>%
filter( parameter == "beta" ) %>%
ggplot() +
geom_vline( xintercept = 0, alpha = 0.5 ) +
geom_histogram(aes( x = mu, group = ResponseId), alpha = 0.7, binwidth = 0.1) +
facet_wrap( ~ condition)
priors %>%
filter( parameter == "beta" ) %>%
ggplot() +
geom_histogram(aes( x = sigma, group = ResponseId), alpha = 0.7, binwidth = 0.1) +
facet_wrap( ~ condition)
In the next two plots, we also visualize the location and scale of the priors for alpha (5.1) and beta (5.2) to give a sense of the distributions of the location and scale values of these prior distributions in a single plot, as opposed to separate histograms.
priors %>%
filter( parameter == "alpha" ) %>%
ggplot( aes(mu, sigma) ) +
geom_point() +
ggtitle("Scatterplot of the mean and sd of the priors on alpha by condition") +
facet_grid( . ~ condition)
priors %>%
filter( parameter == "beta" ) %>%
ggplot( aes(mu, sigma) ) +
geom_point() +
ggtitle("Scatterplot of the mean and sd of the priors on beta by condition") +
facet_grid( . ~ condition)