R code for data visualization in economics, created and maintained by DIME Analytics.
# Install and load packages ---------------
packages <- c(
"tidyverse",
"tidymodels",
"haven",
"labelled",
"forcats",
"latex2exp",
"grid",
"gridExtra",
"cowplot",
"survey"
)
# Change to install = TRUE to install the required packages
pacman::p_load(packages, character.only = TRUE, install = FALSE)
# Global variable for output
table_chr_size = 2.5
# First dataset
data1 <- read_dta("https://github.com/worldbank/r-econ-visual-library/raw/master/Library/Data/METable2data.dta")
data_varlabel <- unlist(var_label(data1))
data1 <- data1 %>%
mutate(case_3 = ifelse(case == 3, TRUE, FALSE))
var_list <- c("correct", "treat_cxr", "re_3", "re_4",
"med_any", "med_l_any_2", "med_l_any_3", "med_k_any_9")
df_result <- tibble(y_var = var_list) %>%
mutate(
model_logit = map(y_var, ~ svyglm(
as.formula(paste(.x, "~ case_3 + factor(city) + factor(type_formal)")),
design = svydesign(ids = ~1, weights = ~weight_city, data = data1),
family = binomial)
),
tidied_logit = map(model_logit, tidy)
)
df_coef_logit <- df_result %>%
dplyr::select(y_var, tidied_logit) %>%
unnest(cols = tidied_logit) %>%
mutate(lower_bound = exp(estimate - 1.96 * std.error),
upper_bound = exp(estimate + 1.96 * std.error),
OR = exp(estimate),
p_value = 2 * pnorm(- abs(estimate / std.error))) %>%
mutate_at("p_value", ~ format(round(., 3), 3)) %>%
mutate_if(is.double, round, digits = 2) %>%
filter(term == "case_3TRUE") %>%
mutate(y_var = as.factor(data_varlabel[.$y_var])) %>%
mutate(y_var = fct_rev(factor(y_var, levels = data_varlabel[var_list])))
p1 <- ggplot(df_coef_logit, aes(x = y_var, y = OR)) +
geom_linerange(aes(ymin = lower_bound, ymax = upper_bound),
alpha = 0.6, size = 0.5) +
geom_dotplot(binaxis = "y",
stackdir = "center",
dotsize = 0.4
) +
coord_flip(ylim = c(0.01, 1e+7)) +
geom_hline(yintercept = 1, size = 0.1, alpha = 0.5) +
scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100)) +
scale_x_discrete(expand = c(0.1, 0.1)) +
theme_classic() +
ylab(TeX(str_interp("$\\leftarrow$ Favors Case 1 (N = ${sum(data1$case_3 == FALSE)}) $\\;\\;\\;\\;\\;\\;$ Favors Privte (N = ${sum(data1$case_3 == TRUE)})$\\rightarrow$ \n"))) +
ggtitle("A. Case 1 vs Case 3 in all providers receiving both cases") +
theme(
axis.line = element_blank(),
axis.ticks.y = element_blank(),
legend.title = element_blank(),
axis.title.y = element_blank(),
axis.title.x = element_text(hjust = -0.15, size = 9),
legend.position = "bottom"
)
ci_str <- c()
for (i in seq_along(df_coef_logit$lower_bound)){
ci_str <- c(ci_str, str_interp("[ ${df_coef_logit$lower_bound[i]}, ${df_coef_logit$upper_bound[i]} ]"))
}
dots_xaxis <- ggplot_build(p1)$data[[1]]['x']
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + annotate("text", x = dots_xaxis$x, y = 500,
label = as.character(df_coef_logit$OR), size = table_chr_size) +
annotate("text", x = dots_xaxis$x, y = 3e+4,
label = ci_str, size = table_chr_size) +
annotate("text", x = dots_xaxis$x, y = 1e+6,
label = df_coef_logit$p_value, size = table_chr_size) +
annotate("text", x = max(dots_xaxis$x) + 0.6, y = 500,
label = "Odds Ratio", size = table_chr_size) +
annotate("text", x = max(dots_xaxis$x) + 0.6, y = 3e+4,
label = "95% CI", size = table_chr_size) +
annotate("text", x = max(dots_xaxis$x) + 0.6, y = 1e+6,
label = "P-value", size = table_chr_size)
# Second dataset
data2 <- read_dta("https://github.com/worldbank/r-econ-visual-library/raw/master/Library/Data/METable2data2.dta")
data_varlabel <- unlist(var_label(data2))
data2 <- data2 %>%
mutate(case_3 = ifelse(case == 3, TRUE, FALSE))
var_list <- c("correct", "treat_cxr", "re_3", "re_4",
"med_any", "med_l_any_2", "med_l_any_3", "med_k_any_9")
df_result <- tibble(y_var = var_list) %>%
mutate(
model_logit = map(
y_var, ~ glm(as.formula(paste(.x, "~ sp4_spur_1")),
data = data2, family = "binomial")
),
tidied_logit = map(model_logit, tidy)
)
df_coef_logit <- df_result %>%
dplyr::select(y_var, tidied_logit) %>%
unnest(cols = tidied_logit) %>%
mutate(
lower_bound = exp(estimate - 1.96 * std.error),
upper_bound = exp(estimate + 1.96 * std.error),
OR = exp(estimate),
p_value = 2 * pnorm(- abs(estimate / std.error))
) %>%
mutate_at("p_value", ~ format(round(., 3), 3)) %>%
mutate_if(is.double, round, digits = 2) %>%
filter(term == "sp4_spur_1") %>%
mutate(y_var = as.factor(data_varlabel[.$y_var])) %>%
mutate(y_var = fct_rev(factor(y_var, levels = data_varlabel[var_list])))
p2 <- ggplot(df_coef_logit, aes(x = y_var, y = OR)) +
geom_linerange(
aes(ymin = lower_bound, ymax = upper_bound),
alpha = 0.6, size = 0.5
) +
geom_dotplot(
binaxis = "y",
stackdir = "center",
dotsize = 0.4
) +
coord_flip(ylim = c(0.01, 1e+7)) +
geom_hline(yintercept = 1, size = 0.1, alpha = 0.5) +
scale_y_log10(breaks = c(0.01, 0.1, 1, 10, 100)) +
scale_x_discrete(expand = c(0.1, 0.1)) +
theme_classic() +
ylab(TeX(str_interp("$\\leftarrow$ Favors Ordinary (N = ${sum(data2$sp4_spur_1 == 1)}) $\\;\\;\\;\\;\\;\\;$ Favors Privte (N = ${sum(data2$sp4_spur_1 == 0)})$\\rightarrow$ \n"))) +
ggtitle("B. SP4 with and without sputum report in Mumbai MBBS+") +
theme(
axis.line = element_blank(),
axis.ticks.y = element_blank(),
legend.title = element_blank(),
axis.title.y = element_blank(),
axis.title.x = element_text(hjust = -0.15, size = 9),
legend.position = "bottom"
)
ci_str <- c()
for (i in seq_along(df_coef_logit$lower_bound)){
ci_str <- c(ci_str, str_interp("[ ${df_coef_logit$lower_bound[i]}, ${df_coef_logit$upper_bound[i]} ]"))
}
dots_xaxis <- ggplot_build(p2)$data[[1]]['x']
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
p2 <- p2 +
annotate(
"text", x = dots_xaxis$x, y = 500,
label = as.character(df_coef_logit$OR), size = table_chr_size
) +
annotate(
"text", x = dots_xaxis$x, y = 3e+4,
label = ci_str, size = table_chr_size
) +
annotate(
"text", x = dots_xaxis$x, y = 1e+6,
label = df_coef_logit$p_value, size = table_chr_size
) +
annotate(
"text", x = max(dots_xaxis$x) + 0.6, y = 500,
label = "Odds Ratio", size = table_chr_size
) +
annotate(
"text", x = max(dots_xaxis$x) + 0.6, y = 3e+4,
label = "95% CI", size = table_chr_size
) +
annotate(
"text", x = max(dots_xaxis$x) + 0.6, y = 1e+6,
label = "P-value", size = table_chr_size
)
grid.arrange(arrangeGrob(p1, p2))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.