Code used for entries in the glossary for the OMbook
This file is simply a cumulative collection of code I wrote to create graphics (or occasionally raw numbers) for entries in the glossary for our book Outcome measures and evaluation in counselling and psychotherapy.
I’m not claiming the code is good, some of it certainly isn’t, but it works. Some of it uses real data which for now I am not making available until I am sure that is safe and OK with others involved in collecting the data.
### this is just the code that creates the "copy to clipboard" function in the code blocks
htmltools::tagList(
xaringanExtra::use_clipboard(
button_text = "<i class=\"fa fa-clone fa-2x\" style=\"color: #301e64\"></i>",
success_text = "<i class=\"fa fa-check fa-2x\" style=\"color: #90BE6D\"></i>",
error_text = "<i class=\"fa fa-times fa-2x\" style=\"color: #F94144\"></i>"
),
rmarkdown::html_dependency_font_awesome()
)
setwd("/media/chris/Clevo_SSD2/Data/MyR/R/distill_blog/test2/_posts/2021-11-09-ombookglossary")
library(tidyverse)
library(janitor)
library(pander)
library(CECPfuns)
library(flextable)
### set theme here
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = .5),
plot.subtitle = element_text(hjust = .5),
axis.title = element_text(size = 15))
set.seed(12345)
valN <- 350
valMean <- 14
valMin <- 0
valMax <- 40
valSD <- 7
rnorm(valN, valMean, valSD) %>%
as_tibble() %>%
mutate(score = round(value),
### reset out of range scores
score = if_else(score < 0, 0, score),
score = if_else(score > 40, 40, score)) -> tibDat
tibDat %>%
summarise(min = min(score),
lQuart = quantile(score, .25),
mean = mean(score),
median = median(score),
uQuart = quantile(score, .75),
max = max(score),
SD = sd(score)) -> tibSummary
ggplot(data = tibDat,
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(xintercept = tibSummary$mean,
colour = "blue",
size = 2) +
geom_vline(xintercept = tibSummary$median,
colour = "green",
size = 2) +
scale_x_continuous(breaks = seq(0, 40, 2)) +
xlab("Score") +
ylab("Count")
ggsave(filename = "mean.png",
width = 1700,
height = 1470,
units = "px")
[1] 767.6734
mean(tibDat$score2)
[1] 200.9804
median(tibDat$score2)
[1] 171.6222
ggplot(data = tibDat,
aes(x = score2)) +
geom_histogram(center = TRUE) +
geom_vline(xintercept = mean(tibDat$score2),
colour = "blue",
size = 2) +
geom_vline(xintercept = median(tibDat$score2),
colour = "green",
size = 2) +
scale_x_continuous(breaks = seq(0, 800, 50)) +
xlab("Score") +
ylab("Count")
ggsave(filename = "mean2.png",
width = 1700,
height = 1470,
units = "px")
ggsave(filename = "symm.png",
width = 1700,
height = 1470,
units = "px")
standardiseVar <- function(x){
x <- x - mean(x, na.rm = TRUE)
x / sd(x, na.rm = TRUE)
}
tibSymm %>%
mutate(score2 = (5 + score)^3,
score2 = standardiseVar(score2)) -> tibPosSkew
ggplot(data = tibPosSkew,
aes(x = score2)) +
geom_histogram(center = TRUE) +
xlab("Score") +
ylab("Count")
ggsave(filename = "negSkew.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibDat,
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(xintercept = tibSummary$mean,
colour = "blue",
size = 2) +
geom_vline(xintercept = tibSummary$median,
colour = "green",
size = 2) +
scale_x_continuous(breaks = seq(0, 40, 2)) +
xlab("Score") +
ylab("Count")
tibDat %>%
rename(symm = score,
skew = score2) %>%
pivot_longer(cols = symm:skew, names_to = "dist", values_to = "score") -> tibDatLong
tibDatLong %>%
group_by(dist) %>%
summarise(min = min(score),
lQuart = quantile(score, .25),
mean = mean(score),
median = median(score),
uQuart = quantile(score, .75),
max = max(score),
SD = sd(score)) -> tibSummary2
tibSummary2 %>%
select(dist, ends_with("Quart")) %>%
pivot_longer(cols = ends_with("Quart")) -> tibIQR
ggplot(data = tibDatLong,
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(data = tibSummary2,
aes(xintercept = median),
colour = "green",
size = 2) +
geom_vline(data = tibSummary2,
aes(xintercept = mean),
colour = "red",
size = 1) +
geom_vline(data = tibSummary2,
aes(xintercept = lQuart),
colour = "blue",
size = 2) +
geom_vline(data = tibSummary2,
aes(xintercept = uQuart),
colour = "blue",
size = 2) +
geom_line(data = tibIQR,
aes(y = 5, x = value),
arrow = arrow(ends = "both"),
colour = "blue",
size = 1.2) +
facet_wrap(facets = vars(dist),
nrow = 2,
scales = "free") +
xlab("Score") +
ylab("Count")
ggsave(filename = "IQR.png",
width = 1700,
height = 1470,
units = "px")
tibSummary2 %>%
filter(dist == "symm") -> tibSummary2symm
tibIQR %>%
filter(dist == "symm") -> tibIQRsymm
ggplot(data = filter(tibDatLong, dist == "symm"),
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(data = tibSummary2symm,
aes(xintercept = median),
colour = "green",
size = 2) +
geom_vline(data = tibSummary2symm,
aes(xintercept = mean),
colour = "red",
size = 1) +
geom_vline(data = tibSummary2symm,
aes(xintercept = lQuart),
colour = "blue",
size = 2) +
geom_vline(data = tibSummary2symm,
aes(xintercept = uQuart),
colour = "blue",
size = 2) +
geom_line(data = tibIQRsymm,
aes(y = 5, x = value),
arrow = arrow(ends = "both"),
colour = "blue",
size = 1.2) +
xlab("Score") +
ylab("Count")
ggsave(filename = "IQRsymm.png",
width = 1700,
height = 1470,
units = "px")
tibSummary2 %>%
filter(dist == "skew") -> tibSummary2skew
tibIQR %>%
filter(dist == "skew") -> tibIQRskew
ggplot(data = filter(tibDatLong, dist == "skew"),
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(data = tibSummary2skew,
aes(xintercept = median),
colour = "green",
size = 2) +
geom_vline(data = tibSummary2skew,
aes(xintercept = mean),
colour = "red",
size = 1) +
geom_vline(data = tibSummary2skew,
aes(xintercept = lQuart),
colour = "blue",
size = 2) +
geom_vline(data = tibSummary2skew,
aes(xintercept = uQuart),
colour = "blue",
size = 2) +
geom_line(data = tibIQRskew,
aes(y = 5, x = value),
arrow = arrow(ends = "both"),
colour = "blue",
size = 1.2) +
xlab("Score") +
ylab("Count")
ggsave(filename = "IQRskew.png",
width = 1700,
height = 1470,
units = "px")
valN <- 300
set.seed(12345)
rnorm(valN, 40, 5) %>%
as_tibble() %>%
rename(score = value) -> tibDat1
ggplot(data = tibDat1,
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(xintercept = mean(tibDat1$score),
colour = "blue",
size = 2) +
geom_vline(xintercept = quantile(tibDat1$score, c(.25, .75)),
colour = "green",
size = 2) +
xlab("Score") +
ylab("Count") +
scale_x_continuous(breaks = seq(0, 100, 10),
limits = c(0, 100))
[1] 40.40703
25% 75%
36.87730 43.78248
rnorm(valN, 40, 15) %>%
as_tibble() %>%
rename(score = value) -> tibDat1
ggplot(data = tibDat1,
aes(x = score)) +
geom_histogram(center = TRUE) +
geom_vline(xintercept = mean(tibDat1$score),
colour = "blue",
size = 2) +
geom_vline(xintercept = quantile(tibDat1$score, c(.25, .75)),
colour = "green",
size = 2) +
xlab("Score") +
ylab("Count") +
scale_x_continuous(breaks = seq(0, 100, 10),
limits = c(0, 100))
[1] 41.16123
25% 75%
32.48149 50.16902
set.seed(12345)
c(6, 6, 6, 6, 12, 18, 24, 50, 100, 200, 500, 1000, 1000, 5000, 10000) %>%
as_tibble() %>%
rename(sampSize = value) %>%
mutate(sampID = row_number()) %>%
# group_by(sampID) %>%
# uncount(weights = sampSize, .id = "indID")
rowwise() %>%
mutate(values = list(sample(1:6, sampSize, replace = TRUE))) %>%
ungroup() %>%
unnest_longer(values) -> tibUnif
# tibUnif
ggplot(data = filter(tibUnif, sampSize == 6),
aes(x = values)) +
# geom_histogram(binwidth = 1,
# boundary = 0) +
geom_bar() +
facet_grid(rows = vars(sampID),
cols = NULL,
scales = "free_y") +
scale_y_continuous(breaks = 0:2) +
scale_x_continuous(name = "Observed scores on each die", breaks = 1:6)
ggsave(filename = "dice1.png",
width = 1700,
height = 1470,
units = "px")
tibUnif %>%
filter(sampSize > 6 & sampSize < 200) -> tmpTib
ggplot(data = tmpTib,
aes(x = values)) +
geom_bar(aes(y = ..prop..)) +
facet_grid(rows = vars(sampSize),
cols = NULL,
scales = "fixed") +
scale_x_continuous(name = "Observed scores on each die", breaks = 1:6) +
scale_y_continuous(name = "Proportion", breaks = (0:5)/10)
ggsave(filename = "dice2.png",
width = 1700,
height = 1470,
units = "px")
tibUnif %>%
filter(sampSize > 200) -> tmpTib
ggplot(data = tmpTib,
aes(x = values)) +
geom_bar(aes(y = ..prop..)) +
facet_grid(rows = vars(sampSize),
cols = NULL,
scales = "fixed") +
scale_x_continuous(name = "Observed scores on each die", breaks = 1:6) +
scale_y_continuous(name = "Proportion", breaks = (0:5)/10)
ggsave(filename = "dice3.png",
width = 1700,
height = 1470,
units = "px")
ggsave(filename = "Gauss1.png",
width = 1700,
height = 1470,
units = "px")
throwDice <- function(nThrows, nSides = 6, scoreSides = 1:6){
### function to simulate throwing dice (or anything really)
### defaults to six sided die with scores 1:6 but you can override that
if(nThrows <= 0) {
stop("nThrows must be positive")
}
if(nThrows > 8) {
stop("nThrows must be under 9 (to keep things easy!)")
}
if(nThrows - round(nThrows) > .Machine$double.eps) {
warning("nThrows wasn't integer, rounded to integer")
nThrows <- round(nThrows)
}
newScores <- scoreSides
while(nThrows > 1) {
newScores <- as.vector(outer(newScores, scoreSides, FUN = "+"))
nThrows <- nThrows - 1
}
newScores
}
# throwDice(0)
# throwDice(1)
# throwDice(1.1)
# throwDice(11)
# throwDice(2)
# length(throwDice(2))
# min(throwDice(2))
# max(throwDice(2))
# nThrows <- 3
# throwDice(nThrows)
# length(throwDice(nThrows))
# min(throwDice(nThrows))
# max(throwDice(nThrows))
# nThrows <- 4
# throwDice(nThrows)
# length(throwDice(nThrows))
# min(throwDice(nThrows))
# max(throwDice(nThrows))
1:8 %>%
as_tibble() %>%
rename(nThrows = value) %>%
rowwise() %>%
mutate(score = list(throwDice(nThrows)),
nThrowsFac = factor(nThrows)) %>%
ungroup() %>%
unnest_longer(score) %>%
group_by(nThrowsFac) %>%
mutate(nScores = n()) %>%
ungroup() %>%
group_by(nThrowsFac, score) %>%
summarise(nThrows = first(nThrows),
nScores = first(nScores),
n = n(),
p = n / nScores) %>%
ungroup() -> tibDiceThrows
ggplot(data = tibDiceThrows,
aes(x = score, y = p, colour = nThrowsFac)) +
geom_point() +
geom_line() +
ylab("Probability") +
scale_x_continuous(name = paste0("Total score from n dice (with n from 1 to ",
max(tibDiceThrows$nThrows),
")"),
breaks = seq(2, max(tibDiceThrows$score), 2)) +
scale_colour_discrete(name = "n(throws)") +
theme(axis.text.x = element_text(angle = 70, hjust = 1))
ggsave(filename = "throwingDice.png",
width = 1700,
height = 1470,
units = "px")
tibDiceThrows %>%
filter(nThrows == 6) -> tmpTib
tmpTib %>%
summarise(n = n(),
minScore = min(score),
meanScore = Hmisc::wtd.mean(score, weights = nScores),
maxScore = max(score),
SDScore = sqrt(Hmisc::wtd.var(score, weights = nScores)),
totP = sum(p)) -> tmpTibSumm
tmpTib %>%
mutate(normScore = score - tmpTibSumm$meanScore,
normScore = normScore / 3,
normProb = dnorm(normScore) / 3) -> tmpTib
tmpTib %>%
summarise(meanNormScore = mean(normScore),
SDNormScore = sd(normScore),
sumNormProb = sum(normProb))
# A tibble: 1 × 3
meanNormScore SDNormScore sumNormProb
<dbl> <dbl> <dbl>
1 0 3.03 1.00
ggplot(data = tmpTib,
aes(x = score, y = p)) +
geom_point() +
geom_line() +
ylab("Probability") +
scale_x_continuous(name = "Total score from 6 dice",
breaks = 6:36)
ggsave(filename = "throwing6Dice.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTib,
aes(x = score, y = p)) +
geom_point() +
geom_line() +
geom_point(aes(y = normProb),
colour = "green") +
geom_line(aes(y = normProb),
colour = "green") +
ylab("Probability") +
scale_x_continuous(name = "Total score from 6 dice",
breaks = 2:max(tmpTib$score)) +
scale_colour_discrete(name = "n(throws)")
ggsave(filename = "throwing6DiceWithGaussian.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTib,
aes(x = score, y = p)) +
geom_point() +
geom_line() +
ylab("Probability") +
scale_x_continuous(name = "Total score from 6 dice",
breaks = 2:max(tmpTib$score)) +
scale_colour_discrete(name = "n(throws)")
ggsave(filename = "throwing6Dice.png",
width = 1700,
height = 1470,
units = "px")
set.seed(12345)
nReps <- 50000
c(1:5, seq(10, 50, 10), 100, 200, 300, 400, 500) %>%
as_tibble() %>%
rename(nThrows = value) %>%
rowwise() %>%
mutate(score = list(rbinom(nReps, nThrows, .5))) %>%
ungroup() %>%
unnest_longer(score) %>%
group_by(nThrows, score) %>%
summarise(n = n(),
p = n / nReps,
nThrowsFac = factor(nThrows)) %>%
ungroup() %>%
group_by(nThrows) %>%
mutate(scaledScore = scale(score)) -> tibBinom
ggplot(data = tibBinom,
aes(x = score, y = p, colour = nThrowsFac)) +
geom_point() +
geom_line()
ggsave(filename = "tossingCoins1.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibBinom,
aes(x = scaledScore, y = p, colour = nThrowsFac)) +
geom_point() +
geom_line()
ggsave(filename = "tossingCoinsScaled.png",
width = 1700,
height = 1470,
units = "px")
pbinom(0:2, 2, .5)
[1] 0.25 0.75 1.00
c(1:5, 10, 20, 50, 100) %>%
as_tibble() %>%
rename(nThrows = value) %>%
rowwise() %>%
mutate(pVals = list(pbinom(0:nThrows, nThrows, .5))) %>%
ungroup() %>%
unnest_longer(pVals) %>%
group_by(nThrows) %>%
mutate(score = row_number() - 1,
nThrowsFac = factor(first(nThrows)),
p = if_else(row_number() == 1, pVals, pVals - lag(pVals)),
meanScore = Hmisc::wtd.mean(score, weights = p),
sdScore = sqrt(Hmisc::wtd.var(score, weights = p, normwt = TRUE)),
stdScore = (score - meanScore) / sdScore) -> tibpBinom
# tibGauss1 %>%
# summarise(max(p)) # .399
#
# tibpBinom %>%
# filter(nThrows == 100) %>%
# summarise(max(p)) # .0796
ggplot(data = tibpBinom,
aes(x = stdScore, y = p, colour = nThrowsFac)) +
geom_point() +
geom_line() +
geom_line(inherit.aes = FALSE,
data = tibGauss1,
aes(x = value, y = p * .0796 / .399)) +
ylab("Probability") +
xlab("Standardised score across tossing n fair coins, heads = 1, tails = 0") +
scale_colour_discrete(name = "n(throws)")
ggsave(filename = "pbinomWithGauss.png",
width = 1700,
height = 1470,
units = "px")
seq(0, 20, length = 5001) %>%
as_tibble() %>%
mutate(NHS = dnorm(value, 8, 2),
HS = dnorm(value, 12, 1)) %>%
pivot_longer(cols = ends_with("HS"), names_to = "Population", values_to = "p") -> tibGauss2
ggplot(data = tibGauss2,
aes(x = value, y = p, colour = Population)) +
geom_line(size = 2, alpha = .8) +
xlab("Observed score") +
ylab("Probability")
ggsave(filename = "Gauss2.png",
width = 1700,
height = 1470,
units = "px")
# tmpDatDir <- "~/internalHDD/Data/CORE/translations/SPA_other/Clara/Our_papers/NClinPaper"
tmpDatDir <- "/media/chris/Clevo_SSD2/Data/CORE/translations/SPA_other/Clara/Our_papers/2020_NClinPaper"
readxl::read_excel(paste0(tmpDatDir, "/", "Muestra_NoClinica_CP180520.xlsx")) %>%
mutate(Gender = recode(Genero,
Masculino = "Male",
Femenino = "Female")) -> tibRawDat
### this is a silly way to get around having to up the na.rm = TRUE clause in the mutate
meanNotNA <- function(x){
mean(x, na.rm = TRUE)
}
medianNotNA <- function(x){
median(x, na.rm = TRUE)
}
varNotNA <- function(x){
var(x, na.rm = TRUE)
}
sdNotNA <- function(x){
sd(x, na.rm = TRUE)
}
minNotNA <- function(x){
min(x, na.rm = TRUE)
}
maxNotNA <- function(x){
max(x, na.rm = TRUE)
}
# tibRawDat %>%
# select(starts_with("COREOM01")) %>%
# corrr::correlate(diagonal = 1) %>%
# pivot_longer(cols = -term) %>%
# filter(term != name) %>%
# arrange(desc(value))
### OK, all recoded
tibRawDat %>%
filter(Excluded == "NO" & !is.na(Genero)) -> tibUseDat
tibUseDat %>%
select(Gender, Edad, starts_with("COREOM01")) %>%
select(-COREOM01_35) %>%
pivot_longer(cols = starts_with("COREOM01"), names_to = "Item", values_to = "Score") %>%
# group_by(Gender, Item) %>%
group_by(Item) %>%
summarise(totN = n(),
nOK = getNOK(Score),
min = minNotNA(Score),
mean = meanNotNA(Score),
median = medianNotNA(Score),
max = maxNotNA(Score),
var = varNotNA(Score),
sd = sdNotNA(Score)) %>%
ungroup() %>%
# filter(min != 0 | max != 4) ## OK full range on all items
arrange(var) %>%
filter(row_number() %in% c(1,2,3, 32, 33, 34)) -> tmpTibSummary
tmpTibSummary %>%
select(Item) %>%
pull() -> tmpVecItems
c("R: plans to end my life",
"R: hurt myself physically...",
"R: threatened or intimidated",
"P: difficulty getting to sleep ...",
"P: thought to blame ...",
"P: felt unhappy") -> tmpVecNames
tibUseDat %>%
select(all_of(tmpVecItems)) -> tmpTibRawScores
tmpTibRawScores %>%
pivot_longer(cols = everything(), names_to = "Item", values_to = "Score") %>%
filter(!is.na(Score)) %>%
mutate(Item = ordered(Item,
levels = tmpVecItems,
labels = tmpVecNames)) -> tmpTibLongScores
tmpTibSummary %>%
mutate(Item = ordered(Item,
levels = tmpVecItems,
labels = tmpVecNames),
meanMinusSD = mean - sd,
meanPlusSD = mean + sd) -> tmpTibSummary
ggplot(data = tmpTibLongScores,
aes(x = Score)) +
ylim(0, 1000) +
geom_bar(width = 1) +
geom_vline(data = tmpTibSummary,
aes(xintercept = mean),
colour = "green") +
facet_wrap(facets = vars(Item), ncol = 3)
ggsave(filename = "Variance1.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTibLongScores,
aes(x = Score)) +
ylim(0, 1000) +
geom_bar(width = 1) +
geom_vline(data = tmpTibSummary,
aes(xintercept = mean),
colour = "green") +
geom_segment(inherit.aes = FALSE,
data = tmpTibSummary,
aes(y = 900, yend = 900,
x = meanMinusSD,
xend = meanPlusSD),
colour = "blue",
lineend = "round",
arrow = arrow(ends = "both",
length = unit(2, "mm"))) +
geom_text(data = tmpTibSummary,
aes(y = 960, x = 4.5, label = paste0("Mean: ",
round(mean, 2),
"\nVar: ",
round(var, 2),
"\nSD: ",
round(sd, 2))),
size = 2.5,
hjust = 1,
vjust = 1) +
facet_wrap(facets = vars(Item), ncol = 3)
ggsave(filename = "Variance2.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibRawDat,
aes(x = Gender)) +
geom_bar() +
xlab("Gender") +
theme(axis.text = element_text(size = 40),
axis.title = element_text(size = 50)) -> ggGender1
ggplot(data = tibRawDat,
aes(x = Edad)) +
geom_histogram(center = TRUE,
breaks = 18:80) +
xlab("Age") +
theme(axis.text = element_text(size = 40),
axis.title = element_text(size = 50)) -> ggAge1
ggpubr::ggarrange(ggGender1, ggAge1) -> ggArrange1
ggpubr::ggexport(ggArrange1,
filename = "Histogram1.png",
width = 1700,
height = 1470,
units = "px")
[1] "Histogram1%03d.png"
ggplot(data = tibRawDat,
aes(x = Edad)) +
geom_histogram(center = TRUE,
breaks = c(18, 20, 30, 40, 50, 80),
closed = "right") +
xlab("Age") +
scale_x_continuous(breaks = seq(10, 80, 10))
# theme(axis.text = element_text(size = 40),
# axis.title = element_text(size = 50))
ggsave(filename = "HistAge.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibRawDat,
aes(x = Edad)) +
geom_histogram(aes(y = stat(count) / sum(count)),
center = TRUE,
breaks = c(18, 20, 30, 40, 50, 80),
closed = "right") +
xlab("Age") +
ylab("Proportion") +
scale_x_continuous(breaks = seq(10, 80, 10))
# theme(axis.text = element_text(size = 40),
# axis.title = element_text(size = 50))
ggsave(filename = "HistAge2.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibRawDat,
aes(x = Edad, fill = Gender)) +
geom_histogram(aes(y = stat(count) / sum(count)),
position = "dodge2",
center = TRUE,
breaks = c(18, seq(20, 80, 10)),
closed = "left") +
xlab("Age") +
ylab("Proportion") +
scale_x_continuous(breaks = seq(20, 80, 5))
ggsave(filename = "HistAgeGendDodge.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibRawDat,
aes(x = Edad, fill = Gender)) +
geom_histogram(aes(y = stat(count) / sum(count)),
position = "stack",
center = TRUE,
breaks = c(18, seq(20, 80, 10)),
closed = "left") +
xlab("Age") +
ylab("Proportion") +
scale_x_continuous(breaks = seq(20, 80, 5))
ggsave(filename = "HistAgeGendStack.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
filter(!is.na(Edad)) %>%
mutate(Age = case_when(Edad < 20 ~ "<20",
Edad < 30 ~ "20 to 29",
Edad < 40 ~ "30 to 39",
Edad < 50 ~ "40 to 49",
Edad >= 50 ~ "50 and over"),
Age = ordered(Age,
levels = c("<20",
"20 to 29",
"30 to 39",
"40 to 49",
"50 and over"))) -> tmpTib #%>% select(Edad, Age)
tmpTib %>%
tabyl(Age) %>%
adorn_pct_formatting(digits = 1) %>%
pander(justify = "lrr")
Age | n | percent |
---|---|---|
<20 | 148 | 14.9% |
20 to 29 | 545 | 55.1% |
30 to 39 | 131 | 13.2% |
40 to 49 | 80 | 8.1% |
50 and over | 86 | 8.7% |
ggsave(filename = "BarAge.png",
width = 1700,
height = 1470,
units = "px")
# A tibble: 1 × 5
min lwrQ median uprQ max
<dbl> <dbl> <dbl> <dbl> <dbl>
1 -3.88 -0.664 0.000494 0.663 3.36
ggplot(data = tmpTibGaussian,
aes(y = value)) +
geom_boxplot(fill = "grey") +
ylim(4, 4) +
xlab("") -> ggGaussBox1
ggGaussBox1
ggsave(filename = "GaussBox1.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTibGaussian,
aes(y = value)) +
geom_boxplot(fill = "grey") +
ylim(-4, 4) +
xlab("") +
theme(axis.text = element_text(size = 40),
axis.title = element_text(size = 50)) -> ggGaussBox2
ggGaussBox2
ggplot(data = tmpTibGaussian,
aes(x = value)) +
geom_histogram(aes(y = ..density..),
fill = "grey") +
geom_vline(xintercept = median(tmpTibGaussian$value)) +
geom_line(data = tibGauss1,
aes(x = value, y = p)) +
ylab("Probability") -> ggGaussHistOnX
ggGaussHistOnX
ggsave(filename = "GaussHistOnX.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTibGaussian,
aes(y = value)) +
geom_histogram(aes(x = ..density..),
fill = "grey") +
geom_hline(yintercept = median(tmpTibGaussian$value)) +
geom_line(data = tibGauss1,
aes(y = value, x = p),
orientation = "y") +
ylim(-4, 4) +
xlab("") +
ylab("") +
theme(axis.text = element_text(size = 40),
axis.title = element_text(size = 50)) -> ggGaussHistOnY
ggGaussHistOnY
ggpubr::ggarrange(ggGaussBox2, ggGaussHistOnY,
ncol = 2) -> ggArrange2
ggArrange2
ggpubr::ggexport(ggArrange2,
filename = "BoxAndHist.png",
width = 1700,
height = 1470,
units = "px")
[1] "BoxAndHist%03d.png"
ggplot(data = tibRawDat,
aes(y = Edad, x = Gender, fill = Gender)) +
geom_boxplot(varwidth = TRUE) +
ylab("Age")
ggsave(filename = "AgeByGenderBox.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
mutate(SocState = `Estado civil`,
SocState = recode(SocState,
`Casado/a` = "Coupled",
`Soltero` = "Single",
`Separado/a` = "Separated",
`Divorciado/a` = "Divorced",
`Unido/a` = "Coupled",
`Unido` = "Coupled",
`Viudo/a` = "Widowed/Widower")) %>%
rowwise() %>%
mutate(meanCOREOM = meanNotNA(c_across(COREOM01_01:COREOM01_34))) %>%
ungroup() -> tibRawDat
tibRawDat %>%
mutate(SocStatNA = if_else(is.na(SocState), "NA", SocState)) %>%
group_by(SocStatNA) %>%
summarise(tmpN = n(),
tmpMedianAge = median(Edad, na.rm = TRUE),
varAge = var(Edad, na.rm = TRUE),
SDAge = sqrt(varAge)) -> tmpTib
tmpTib %>%
arrange(desc(tmpN)) %>%
select(SocStatNA) %>%
pull() -> tmpVecN
tmpTib %>%
arrange(tmpMedianAge) %>%
select(SocStatNA) %>%
pull() -> tmpVecAge
tibRawDat %>%
mutate(SocStatNA = if_else(is.na(SocState), "NA", SocState),
SocStatN = ordered(SocStatNA,
levels = tmpVecN,
labels = tmpVecN),
SocStateAge = ordered(SocStatNA,
levels = tmpVecAge,
labels = tmpVecAge)) -> tibRawDat
ggplot(data = tibRawDat,
aes(y = Edad, x = SocStatN, fill = SocStatN)) +
geom_boxplot(varwidth = TRUE) +
geom_text(data = tmpTib,
inherit.aes = FALSE,
aes(x = SocStatNA, y = 12, label = tmpN),
vjust = .5) +
geom_text(x = .5, y = 12, label = "n: ",
vjust = .5,
hjust = 0) +
ylim(10, 80) +
scale_fill_discrete(name = "Social status") +
ylab("Age") +
xlab("Social status") +
theme(axis.text.x = element_text(angle = 70, hjust = 1))
ggsave(filename = "AgeBySocStatnBox.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tibRawDat,
aes(y = meanCOREOM, x = SocStatN, fill = SocStatN)) +
geom_boxplot(varwidth = TRUE) +
geom_text(data = tmpTib,
inherit.aes = FALSE,
aes(x = SocStatNA, y = -.2, label = tmpN),
vjust = .5) +
geom_text(x = .5, y = -.2, label = "n: ",
vjust = .5,
hjust = 0) +
ylim(-.5, 4) +
scale_fill_discrete(name = "Social status") +
ylab("CORE-OM score (mean item score)") +
xlab("Social status") +
theme(axis.text.x = element_text(angle = 70, hjust = 1))
ggsave(filename = "COREOMBySocStatnBox.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
filter(!is.na(`Tiene Hijos`)) %>%
mutate(nChildren = `Especifique cuántos`,
nChildren = if_else(nChildren > 3, 4, nChildren),
nChildren = if_else(`Tiene Hijos` == "NO", 0, nChildren),
nChildren = ordered(nChildren,
levels = 0:4,
labels = c(as.character(0:3),
"4 or more"))) %>%
filter(!is.na(nChildren)) -> tmpTib
ggplot(data = tmpTib,
aes(y = Edad, x = nChildren, fill = nChildren)) +
geom_boxplot(varwidth = TRUE)
tmpTibGaussian %>%
filter(row_number() <= 500) %>%
mutate(samp250 = if_else(row_number() <= 250, value, NA_real_),
samp100 = if_else(row_number() <= 100, value, NA_real_),
samp50 = if_else(row_number() <= 50, value, NA_real_),
samp10 = if_else(row_number() <= 10, value, NA_real_)) %>%
rename(samp500 = value) -> tmpTib
tmpTib %>%
pivot_longer(cols = everything(), names_to = "sample") %>%
mutate(sample = ordered(sample,
levels = paste0("samp", c(500, 250, 100, 50, 10)))) -> tmpTibLong
ggplot(data = tmpTib,
aes(y = samp250)) +
geom_boxplot(fill = "grey", varwidth = TRUE, notch = TRUE) +
ylim(-4, 4) +
xlab("") +
ylab("value") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
ggsave(filename = "notchedGaussianBox1.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTib,
aes(y = samp250)) +
geom_boxplot(fill = "grey", varwidth = TRUE) +
ylim(-4, 4) +
xlab("") +
ylab("value") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
ggsave(filename = "notNotchedGaussianBox1.png",
width = 1700,
height = 1470,
units = "px")
ggplot(data = tmpTibLong,
aes(y = value)) +
geom_boxplot(fill = "grey", varwidth = TRUE, notch = TRUE) +
geom_hline(yintercept = 0) +
ylim(-4, 4) +
facet_grid(cols = vars(sample)) +
xlab("") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
ggsave(filename = "notchedGaussianBoxes2.png",
width = 1700,
height = 1470,
units = "px")
tmpTibGaussian %>%
mutate(simulN = row_number() %% 20) %>%
group_by(simulN) %>%
mutate(samp250 = if_else(row_number() <= 250, value, NA_real_),
samp100 = if_else(row_number() <= 100, value, NA_real_),
samp50 = if_else(row_number() <= 50, value, NA_real_),
samp10 = if_else(row_number() <= 10, value, NA_real_)) %>%
rename(samp500 = value) %>%
ungroup() -> tmpTib
tmpTib %>%
filter(simulN > 1 & simulN <= 5) %>%
pivot_longer(cols = -simulN, names_to = "sample") %>%
mutate(sample = ordered(sample,
levels = paste0("samp", c(500, 250, 100, 50, 10)))) -> tmpTibLong
ggplot(data = tmpTibLong,
aes(y = value)) +
geom_boxplot(fill = "grey", varwidth = TRUE, notch = TRUE) +
geom_hline(yintercept = 0) +
ylim(-4, 4) +
facet_grid(cols = vars(sample),
rows = vars(simulN)) +
xlab("") +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_blank())
ggsave(filename = "notchedGaussianBoxes3.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
group_by(Gender) %>%
summarise(n = n(),
median = median(Edad, na.rm = TRUE)) -> tmpTibSummary
ggplot(data = tibRawDat,
aes(y = Edad, x = Gender, fill = Gender)) +
geom_boxplot(varwidth = TRUE, notch = TRUE) +
geom_text(data = tmpTibSummary,
aes(x = Gender, y = 14, label = n),
vjust = 0) +
geom_text(x = .90, y = 14, label = "n: ",
hjust = 1,
vjust = 0) +
geom_text(data = tmpTibSummary,
aes(x = Gender, y = 11, label = median),
vjust = 0,
fontface = "plain") +
geom_text(x = .9, y = 11, label = "median: ",
hjust = 1,
vjust = 0,
fontface = "plain") +
geom_hline(yintercept = median(tibRawDat$Edad, na.rm = TRUE)) +
ylab("Age")
ggsave(filename = "AgeByGenderNotchedBox.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
mutate(SocStatNA = if_else(is.na(SocState), "NA", SocState)) %>%
group_by(SocStatNA) %>%
summarise(tmpN = n(),
tmpMedianAge = median(Edad, na.rm = TRUE),
varAge = var(Edad, na.rm = TRUE),
SDAge = sqrt(varAge)) -> tmpTib
tmpTib %>%
arrange(desc(tmpN)) %>%
select(SocStatNA) %>%
pull() -> tmpVecN
tmpTib %>%
arrange(tmpMedianAge) %>%
select(SocStatNA) %>%
pull() -> tmpVecAge
tibRawDat %>%
mutate(SocStatNA = if_else(is.na(SocState), "NA", SocState),
SocStatN = ordered(SocStatNA,
levels = tmpVecN,
labels = tmpVecN),
SocStateAge = ordered(SocStatNA,
levels = tmpVecAge,
labels = tmpVecAge)) -> tibRawDat
ggplot(data = tibRawDat,
aes(y = Edad, x = SocStatN, fill = SocStatN)) +
geom_boxplot(varwidth = TRUE, notch = TRUE) +
geom_hline(yintercept = median(tibRawDat$Edad, na.rm = TRUE)) +
geom_text(data = tmpTib,
inherit.aes = FALSE,
aes(x = SocStatNA, y = 12, label = tmpN),
vjust = .5) +
geom_text(x = .5, y = 12, label = "n: ",
vjust = .5,
hjust = 0) +
ylim(10, 80) +
scale_fill_discrete(name = "Social status") +
ylab("Age") +
xlab("Social status") +
theme(axis.text.x = element_text(angle = 70, hjust = 1))
ggsave(filename = "AgeBySocStatnNotchedBox.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
filter(!is.na(meanCOREOM)) %>%
group_by(SocStatN) %>%
summarise(n = n(),
median = round(median(meanCOREOM), 1)) -> tmpTibSummary
ggplot(data = tibRawDat,
aes(y = meanCOREOM, x = SocStatN, fill = SocStatN)) +
geom_boxplot(varwidth = TRUE, notch = TRUE) +
geom_hline(yintercept = median(tibRawDat$meanCOREOM, na.rm = TRUE)) +
geom_text(data = tmpTibSummary,
inherit.aes = FALSE,
aes(x = SocStatN, y = -.3, label = n),
vjust = .5) +
geom_text(x = .7, y = -.3, label = "n: ",
vjust = .5,
hjust = 1) +
geom_text(data = tmpTibSummary,
inherit.aes = FALSE,
aes(x = SocStatN, y = -.5, label = median),
vjust = .5) +
geom_text(x = .7, y = -.5, label = "median: ",
vjust = .5,
hjust = 1) +
ylim(-.5, 4) +
expand_limits(x = -.7) +
scale_fill_discrete(name = "Social status") +
ylab("CORE-OM score (mean item score)") +
xlab("Social status") +
theme(axis.text.x = element_text(angle = 70, hjust = 1))
ggsave(filename = "COREOMBySocStatnBox.png",
width = 1700,
height = 1470,
units = "px")
tibRawDat %>%
filter(!is.na(`Tiene Hijos`)) %>%
mutate(nChildren = `Especifique cuántos`,
nChildren = if_else(nChildren > 3, 4, nChildren),
nChildren = if_else(`Tiene Hijos` == "NO", 0, nChildren),
nChildren = ordered(nChildren,
levels = 0:4,
labels = c(as.character(0:3),
"4 or more"))) %>%
filter(!is.na(nChildren)) -> tmpTib
ggplot(data = tmpTib,
aes(y = Edad, x = nChildren, fill = nChildren)) +
geom_boxplot(varwidth = TRUE)
### start with simple Gaussian n = 50
set.seed(12345)
tmpValSampN <- 50
1:5 %>%
as_tibble() %>%
rename(simN = value) %>%
group_by(simN) %>%
mutate(values = list(rnorm(tmpValSampN))) %>%
ungroup() %>%
unnest_longer(values) %>%
mutate(simN = ordered(simN)) -> tmpTib
ggplot(data = filter(tmpTib, simN == 1),
aes(x = simN, y = values)) +
geom_violin(fill = "grey") +
xlab("")
ggsave(filename = "geomViolin1.png",
width = 1700,
height = 1470,
units = "px")
### add jittered points to the plot
ggplot(data = filter(tmpTib, simN == 1),
aes(x = simN, y = values)) +
geom_violin(fill = "grey") +
geom_jitter(height = 0, width = .07, alpha = .7) +
xlab("")
ggsave(filename = "geomViolin2.png",
width = 1700,
height = 1470,
units = "px")
### no jittering, just transparency
ggplot(data = filter(tmpTib, simN == 1),
aes(x = simN, y = values)) +
geom_violin(fill = "grey") +
geom_point(alpha = .3) +
xlab("")
ggsave(filename = "geomViolin3.png",
width = 1700,
height = 1470,
units = "px")
### now use all five simulations
ggplot(data = tmpTib,
aes(x = simN, y = values)) +
geom_violin(fill = "grey") +
geom_jitter(height = 0, width = .07, alpha = .7) +
geom_hline(yintercept = 0) +
xlab("Simulations")
ggsave(filename = "geomViolin4.png",
width = 1700,
height = 1470,
units = "px")
### now move to different sample sizes
c(50, 100, 200, 500, 1000, 5000, 50000, 500000) -> tmpVecN
### get sample sizes into human readable rather than scientific format
prettyNum(tmpVecN, big.mark = ",", scientific = FALSE) -> tmpVecLabels
tmpVecN %>%
as_tibble() %>%
rename(n = value) %>%
### need rowwise() otherwise dplyr seems to use same seed each time
rowwise() %>%
mutate(values = list(rnorm(n))) %>%
ungroup() %>%
### applying the labels to get discrete and readable variable
mutate(nFac = ordered(n,
labels = tmpVecLabels)) %>%
unnest_longer(values) -> tmpTib
ggplot(data = tmpTib,
aes(x = nFac, y = values)) +
geom_violin(fill = "grey") +
geom_jitter(height = 0, width = .1, alpha = .3) +
geom_hline(yintercept = 0) +
scale_x_discrete()
ggsave(filename = "geomViolin5.png",
width = 1700,
height = 1470,
units = "px")
### but showing the points in the larger samples is mad so ...
ggplot(data = tmpTib,
aes(x = nFac, y = values)) +
geom_violin(fill = "grey") +
geom_jitter(data = filter(tmpTib, n < 5000),
height = 0, width = .15, alpha = .1) +
geom_hline(yintercept = 0) +
scale_x_discrete()
ggsave(filename = "geomViolin6.png",
width = 1700,
height = 1470,
units = "px")
### now some real data
tibRawDat %>%
filter(!is.na(SocState)) %>%
filter(!is.na(Gender)) %>%
mutate(SocStat2 = recode(SocStatNA,
"NA" = NA_character_,
"Divorced" = "Single",
"Separated" = "Single",
"Widowed/Widower" = "Single")) -> tmpTib
tmpTib %>%
group_by(Gender, SocStat2) %>%
summarise(CI = list(getBootCImean(meanCOREOM))) %>%
unnest_wider(CI) -> tmpSummary
tmpSummary
# A tibble: 4 × 5
# Groups: Gender [2]
Gender SocStat2 obsmean LCLmean UCLmean
<chr> <chr> <dbl> <dbl> <dbl>
1 Female Coupled 0.778 0.695 0.861
2 Female Single 1.03 0.978 1.08
3 Male Coupled 0.728 0.670 0.793
4 Male Single 1.03 0.967 1.09
ggplot(data = tmpTib,
aes(x = interaction(Gender, SocStat2), y = meanCOREOM, fill = Gender)) +
geom_violin() +
geom_jitter(height = 0, width = .1, alpha = .2) +
geom_hline(yintercept = mean(tmpTib$meanCOREOM)) +
### add summary statistics
geom_point(data = tmpSummary,
aes(y = obsmean),
position = position_nudge(x = -.15),
size = 2.5) +
geom_linerange(data = tmpSummary,
inherit.aes = FALSE,
aes(x = interaction(Gender, SocStat2),
ymin = LCLmean, ymax = UCLmean),
position = position_nudge(x = -.15),
size = 1.5) +
xlab("Gender and social status") +
ylab("CORE-OM score (mean item score)")
ggsave(filename = "geomViolin7.png",
width = 1700,
height = 1470,
units = "px")
### get the data
tibUseDat %>%
mutate(SocState = `Estado civil`,
SocState = recode(SocState,
`Casado/a` = "Coupled",
`Soltero` = "Single",
`Separado/a` = "Separated",
`Divorciado/a` = "Divorced",
`Unido/a` = "Coupled",
`Unido` = "Coupled",
`Viudo/a` = "Widowed/Widower")) %>%
rowwise() %>%
mutate(meanCOREOM = meanNotNA(c_across(COREOM01_01:COREOM01_34))) %>%
ungroup() -> tibUseDat
### histogram of the raw data
ggplot(data = tibUseDat,
aes(x = meanCOREOM)) +
geom_histogram() +
xlab("CORE-OM total score (item mean)") +
ylab("Count")
[1] 0.9312496
var(tibUseDat$meanCOREOM)
[1] 0.2700728
sd(tibUseDat$meanCOREOM)
[1] 0.5196853
[1] 0.01646686
[1] 0.8989745
[1] 0.9635246
# tibUseDat %>%
# select(Gender, SocState, meanCOREOM) %>%
# drop_na() -> tmpTib
#
# ggplot(data = tmpTib,
# aes(x = Gender, y = meanCOREOM)) +
# geom_boxplot(varwidth = TRUE, notch = TRUE)
#
# ggplot(data = tmpTib,
# aes(x = Gender, y = meanCOREOM)) +
# geom_violin()
### get a well validated jackknife method
bootstrap::jackknife(tmpTib$meanCOREOM, mean)
$jack.se
[1] 0.01688115
$jack.bias
[1] 0
$jack.values
[1] 0.9630860 0.9628791 0.9629973 0.9629418 0.9634407 0.9629678
[7] 0.9630564 0.9627608 0.9628809 0.9634998 0.9629086 0.9631747
[13] 0.9627904 0.9631747 0.9630564 0.9633225 0.9633225 0.9617263
[19] 0.9632338 0.9634112 0.9633986 0.9629086 0.9632929 0.9632042
[25] 0.9627608 0.9631156 0.9629382 0.9624948 0.9628200 0.9630269
[31] 0.9633816 0.9625244 0.9634703 0.9629973 0.9629973 0.9631451
[37] 0.9623327 0.9631451 0.9628495 0.9631451 0.9628495 0.9635294
[43] 0.9629382 0.9625244 0.9628791 0.9632634 0.9625835 0.9629678
[49] 0.9636181 0.9628791 0.9632634 0.9631156 0.9632042 0.9632634
[55] 0.9633682 0.9625835 0.9623766 0.9635885 0.9625835 0.9628791
[61] 0.9632929 0.9629418 0.9637954 0.9633520 0.9635885 0.9637067
[67] 0.9618149 0.9637067 0.9637067 0.9637363 0.9627608 0.9635294
[73] 0.9626722 0.9629113 0.9631451 0.9630269 0.9632338 0.9627608
[79] 0.9632634 0.9633225 0.9632042 0.9636181 0.9620810 0.9630636
[85] 0.9627017 0.9618741 0.9629113 0.9628200 0.9637336 0.9630860
[91] 0.9627904 0.9627017 0.9633816 0.9637954 0.9628791 0.9632042
[97] 0.9627608 0.9626130 0.9627904 0.9632929 0.9632929 0.9626068
[103] 0.9631451 0.9632929 0.9635294 0.9636181 0.9630269 0.9627017
[109] 0.9629770 0.9636476 0.9633225 0.9634703 0.9630860 0.9626426
[115] 0.9634998 0.9636476 0.9635590 0.9632042 0.9632634 0.9631451
[121] 0.9629678 0.9631156 0.9635885 0.9630564 0.9628200 0.9634703
[127] 0.9629382 0.9634998 0.9611055 0.9627608 0.9629382 0.9632634
[133] 0.9627017 0.9630564 0.9632338 0.9631451 0.9625835 0.9631451
[139] 0.9633225 0.9630564 0.9631451 0.9634998 0.9629678 0.9617854
[145] 0.9622879 0.9626722 0.9627904 0.9624357 0.9625835 0.9627608
[151] 0.9628791 0.9626426 0.9632634 0.9633816 0.9627904 0.9629973
[157] 0.9633816 0.9630269 0.9627608 0.9631451 0.9632929 0.9632338
[163] 0.9623766 0.9634998 0.9635590 0.9633816 0.9625835 0.9629722
[169] 0.9619036 0.9626426 0.9625539 0.9629678 0.9632929 0.9634112
[175] 0.9623561 0.9632929 0.9630269 0.9629382 0.9634407 0.9637067
[181] 0.9623174 0.9618149 0.9625539 0.9629678 0.9630860 0.9636181
[187] 0.9632929 0.9630860 0.9624948 0.9627608 0.9627017 0.9629973
[193] 0.9634112 0.9633816 0.9621992 0.9629086 0.9630860 0.9625835
[199] 0.9615785 0.9629086 0.9632929 0.9637032 0.9624948 0.9631451
[205] 0.9629973 0.9629678 0.9631156 0.9624061 0.9620219 0.9625835
[211] 0.9627608 0.9635294 0.9624357 0.9630564 0.9631747 0.9633816
[217] 0.9628495 0.9623631 0.9629276 0.9628504 0.9623661 0.9625244
[223] 0.9630860 0.9637067 0.9633225 0.9631451 0.9637954 0.9632929
[229] 0.9632042 0.9629086 0.9629382 0.9627017 0.9634703 0.9632463
[235] 0.9634900 0.9636181 0.9622109 0.9627017 0.9629973 0.9633520
[241] 0.9631451 0.9632042 0.9627608 0.9633225 0.9634703 0.9631156
[247] 0.9637067 0.9637067 0.9629382 0.9635885 0.9636476 0.9628791
[253] 0.9633520 0.9621105 0.9615785 0.9628200 0.9634703 0.9632929
[259] 0.9631747 0.9633225 0.9629382 0.9631451 0.9630564 0.9632634
[265] 0.9615785 0.9632634 0.9621105 0.9634407 0.9635885 0.9630564
[271] 0.9624948 0.9634703 0.9635885 0.9636476 0.9635590 0.9632634
[277] 0.9629973 0.9627017 0.9634703 0.9633520 0.9618741 0.9629418
[283] 0.9635590 0.9627313 0.9631451 0.9632042 0.9620219 0.9632042
[289] 0.9631451 0.9636727 0.9634998 0.9630860 0.9630880 0.9624948
[295] 0.9635813 0.9631156 0.9628809 0.9615713 0.9628809 0.9626130
[301] 0.9632634 0.9626722 0.9630564 0.9632042 0.9627591 0.9629973
[307] 0.9637363 0.9626722 0.9635813 0.9631451 0.9632929 0.9634112
[313] 0.9636181 0.9627608 0.9631747 0.9628200 0.9632634 0.9634112
[319] 0.9629382 0.9624061 0.9634998 0.9620810 0.9628791 0.9629382
[325] 0.9632042 0.9634703 0.9633520 0.9630269 0.9624652 0.9634112
[331] 0.9627608 0.9631156 0.9629678 0.9630860 0.9635590 0.9631156
[337] 0.9632929 0.9629678 0.9632929 0.9627017 0.9619332 0.9628495
[343] 0.9623766 0.9635885 0.9629722 0.9633225 0.9627608 0.9632929
[349] 0.9632042 0.9629086 0.9628495 0.9628791 0.9632634 0.9627904
[355] 0.9629678 0.9637363 0.9625539 0.9629113 0.9630564 0.9634407
[361] 0.9626130 0.9628200 0.9630860 0.9628791 0.9629382 0.9629678
[367] 0.9632042 0.9634703 0.9634998 0.9629973 0.9635294 0.9626722
[373] 0.9637659 0.9631747 0.9624948 0.9631654 0.9629113 0.9613438
[379] 0.9635590 0.9622879 0.9632042 0.9615489 0.9629382 0.9637945
[385] 0.9636118 0.9621105 0.9627608 0.9631451 0.9630269 0.9632042
[391] 0.9633225 0.9634112 0.9620219 0.9634407 0.9623022 0.9633377
[397] 0.9624061 0.9630564 0.9634112 0.9633520 0.9620586 0.9626130
[403] 0.9614307 0.9624061 0.9628791 0.9627904 0.9618741 0.9627904
[409] 0.9631747 0.9626068 0.9626130 0.9632634 0.9631451 0.9627313
[415] 0.9627017 0.9626130 0.9632929 0.9620810 0.9633225 0.9620514
[421] 0.9628495 0.9632634 0.9627608 0.9633816 0.9628200 0.9622457
[427] 0.9625835 0.9627904 0.9629678 0.9621697 0.9634703 0.9629086
[433] 0.9631747 0.9616376 0.9624357 0.9629418 0.9633225 0.9612829
[439] 0.9625835 0.9627904 0.9628495 0.9632042 0.9632042 0.9631747
[445] 0.9629973 0.9626130 0.9614898 0.9624061 0.9629086 0.9629973
[451] 0.9633816 0.9626981 0.9629722 0.9629086 0.9627017 0.9632634
[457] 0.9636181 0.9625539 0.9626130 0.9621401 0.9627904 0.9625539
[463] 0.9630564 0.9631550 0.9624948 0.9614602 0.9630860 0.9625539
[469] 0.9627017 0.9617558 0.9630269 0.9624357 0.9628200 0.9620219
[475] 0.9621992 0.9625539 0.9622879 0.9622583 0.9625835 0.9633520
[481] 0.9630860 0.9625835 0.9615785 0.9615489 0.9630564 0.9630269
[487] 0.9624948 0.9625244 0.9621992 0.9629973 0.9624061 0.9624652
[493] 0.9627313 0.9633816 0.9629086 0.9625539 0.9626722 0.9629678
[499] 0.9625835 0.9632338 0.9630636 0.9626426 0.9626130 0.9631747
[505] 0.9632929 0.9623174 0.9619923 0.9633225 0.9630564 0.9627313
[511] 0.9631245 0.9622879 0.9631451 0.9633225 0.9627608 0.9621992
[517] 0.9631156 0.9622879 0.9617854 0.9628200 0.9621992 0.9627904
[523] 0.9623766 0.9627313 0.9623174 0.9623174 0.9616967 0.9626722
[529] 0.9619036 0.9629678 0.9634703 0.9632929 0.9631451 0.9634703
[535] 0.9631747 0.9634703 0.9626426 0.9628495 0.9634407 0.9615785
[541] 0.9619332 0.9624545 0.9632634 0.9626722 0.9629086 0.9631747
[547] 0.9625835 0.9632338 0.9623174 0.9629973 0.9623470 0.9632042
[553] 0.9631156 0.9630564 0.9627313 0.9624948 0.9619627 0.9631550
[559] 0.9623022 0.9627482 0.9624652 0.9632929 0.9622879 0.9621697
[565] 0.9631156 0.9628791 0.9624061 0.9630860 0.9624357 0.9627608
[571] 0.9630860 0.9625244 0.9630564 0.9632042 0.9636772 0.9628495
[577] 0.9622879 0.9630269 0.9631747 0.9627608 0.9625539 0.9619036
[583] 0.9625244 0.9624652 0.9617854 0.9616931 0.9629973 0.9634703
[589] 0.9615104 0.9628809 0.9625835 0.9623470 0.9619332 0.9628495
[595] 0.9618445 0.9632463 0.9616671 0.9626722 0.9624948 0.9620810
[601] 0.9615489 0.9621992 0.9624745 0.9627904 0.9623766 0.9626722
[607] 0.9621992 0.9624652 0.9628791 0.9629678 0.9620890 0.9631747
[613] 0.9617263 0.9635885 0.9631451 0.9629973 0.9632338 0.9623470
[619] 0.9626426 0.9627904 0.9628200 0.9633520 0.9627608 0.9622288
[625] 0.9627608 0.9630269 0.9629678 0.9632929 0.9633816 0.9630269
[631] 0.9634407 0.9632338 0.9623936 0.9620810 0.9635885 0.9632634
[637] 0.9630564 0.9625244 0.9628495 0.9625835 0.9635885 0.9633816
[643] 0.9636476 0.9629382 0.9624652 0.9624652 0.9623174 0.9616080
[649] 0.9634998 0.9634703 0.9629678 0.9615193 0.9627608 0.9632338
[655] 0.9632338 0.9631156 0.9632042 0.9632929 0.9631156 0.9626426
[661] 0.9634112 0.9627608 0.9620810 0.9618149 0.9629382 0.9629973
[667] 0.9623470 0.9630860 0.9636772 0.9627608 0.9635294 0.9625244
[673] 0.9631156 0.9632338 0.9630860 0.9633520 0.9635885 0.9630564
[679] 0.9627017 0.9627017 0.9638250 0.9633520 0.9633225 0.9632042
[685] 0.9623470 0.9633986 0.9633816 0.9633225 0.9628495 0.9628791
[691] 0.9626130 0.9629973 0.9625244 0.9626722 0.9628200 0.9635885
[697] 0.9633816 0.9616671 0.9628504 0.9629086 0.9617263 0.9630636
[703] 0.9629418 0.9633816 0.9623936 0.9624652 0.9633520 0.9634407
[709] 0.9618741 0.9624948 0.9626426 0.9636181 0.9628791 0.9614898
[715] 0.9613715 0.9621105 0.9631156 0.9630860 0.9623470 0.9624948
[721] 0.9619332 0.9631451 0.9619332 0.9634112 0.9634998 0.9631451
[727] 0.9632338 0.9615785 0.9621401 0.9634407 0.9630564 0.9626426
[733] 0.9622879 0.9612829 0.9637954 0.9629382 0.9627904 0.9621401
[739] 0.9636772 0.9634900 0.9636181 0.9633520 0.9630564 0.9627591
[745] 0.9632634 0.9629382 0.9633816 0.9625835 0.9617558 0.9630564
[751] 0.9631747 0.9635885 0.9612829 0.9631747 0.9627017 0.9630269
[757] 0.9627017 0.9634112 0.9622583 0.9634112 0.9631854 0.9629382
[763] 0.9634407 0.9631451 0.9636181 0.9628200 0.9632042 0.9615489
[769] 0.9631156 0.9631747 0.9623766 0.9633816 0.9627904 0.9628791
[775] 0.9630860 0.9627608 0.9626426 0.9622288 0.9632042 0.9630269
[781] 0.9615489 0.9630860 0.9631747 0.9632929 0.9629382 0.9632929
[787] 0.9628495 0.9628495 0.9632042 0.9630269 0.9627608 0.9631156
[793] 0.9630941 0.9626426 0.9619332 0.9628495 0.9624357 0.9633225
[799] 0.9619627 0.9629086 0.9634703 0.9630269 0.9634703 0.9623766
[805] 0.9625539 0.9630860 0.9624948 0.9630269 0.9629678 0.9634998
[811] 0.9628495 0.9619627 0.9630564 0.9622288 0.9625244 0.9618454
[817] 0.9629973 0.9635294 0.9618741 0.9625539 0.9629086 0.9627313
[823] 0.9630564 0.9633816 0.9627017 0.9633225 0.9629973 0.9625539
[829] 0.9629973 0.9628791 0.9610464 0.9623470 0.9609873 0.9624357
[835] 0.9619923 0.9632634 0.9615713 0.9620219 0.9628791 0.9624061
[841] 0.9629678 0.9614602 0.9624357 0.9628791 0.9629678 0.9629382
[847] 0.9630860 0.9623470 0.9619923 0.9629678 0.9622879 0.9631451
[853] 0.9637363 0.9627904 0.9631747 0.9626068 0.9635590 0.9634998
[859] 0.9632338 0.9629142 0.9628809 0.9626130 0.9629418 0.9629086
[865] 0.9627904 0.9632634 0.9621697 0.9625244 0.9629382 0.9627904
[871] 0.9631451 0.9630564 0.9626130 0.9632042 0.9619672 0.9631747
[877] 0.9614602 0.9621697 0.9630564 0.9629678 0.9627904 0.9629973
[883] 0.9633520 0.9608099 0.9629678 0.9627608 0.9634595 0.9628791
[889] 0.9631451 0.9624357 0.9627904 0.9625835 0.9630860 0.9633520
[895] 0.9621992 0.9619332 0.9625244 0.9630564 0.9628495 0.9621697
[901] 0.9636476 0.9621697 0.9630564 0.9625244 0.9629382 0.9625154
[907] 0.9625835 0.9617558 0.9628495 0.9620810 0.9618741 0.9625244
[913] 0.9629973 0.9629678 0.9633520 0.9623470 0.9633225 0.9630269
[919] 0.9633225 0.9630860 0.9628495 0.9628200 0.9631747 0.9626130
[925] 0.9636476 0.9629086 0.9628200 0.9620219 0.9632929 0.9617263
[931] 0.9620219 0.9627608 0.9622879 0.9636772 0.9630269 0.9628791
[937] 0.9623174 0.9609873 0.9626426 0.9628495 0.9631451 0.9634998
[943] 0.9637363 0.9636181 0.9631156 0.9630564 0.9630269 0.9633816
[949] 0.9635590 0.9633520 0.9625835 0.9624652 0.9621992 0.9622583
[955] 0.9624061 0.9632042 0.9631451 0.9629382 0.9624357 0.9633816
[961] 0.9617854 0.9629086 0.9627904 0.9632929 0.9636476 0.9629456
[967] 0.9627608 0.9627313 0.9629973 0.9625835 0.9626981 0.9622839
[973] 0.9627313 0.9629086 0.9634998 0.9618149 0.9609281 0.9629382
[979] 0.9625835 0.9623470 0.9624061 0.9626722 0.9625835 0.9619923
[985] 0.9626426 0.9622879 0.9614898 0.9614602 0.9633816 0.9632634
[991] 0.9630860 0.9620219 0.9632338 0.9631747 0.9630398 0.9612237
$call
bootstrap::jackknife(x = tmpTib$meanCOREOM, theta = mean)
### but I'm a masochistic, obsessional idiot so I replicate that!
jackMean <- function(x, confint = .95){
x <- na.omit(x)
valN <- length(x)
valSampMean <- mean(x)
vecJackMeans <- rep(NA, valN)
vecPseudoVals <- rep(NA, valN)
for (i in 1:valN){
vecJackMeans[i] <- mean(x[-i])
vecPseudoVals[i] <- valN * valSampMean - (valN - 1)*vecJackMeans[i]
}
valBias <- (valN - 1) * (mean(vecJackMeans) - valSampMean)
valSE <- sqrt(((valN - 1) / valN) * sum((vecJackMeans - mean(vecJackMeans))^2))
valQt <- qt((1 - (1 - confint)/2), valN - 1)
print(valQt)
LCL <- valSampMean - (valSE * valQt)
UCL <- valSampMean + (valSE * valQt)
return(list(n = valN,
mean = valSampMean,
bias = valBias,
SE = valSE,
confint = confint,
LCL = LCL,
UCL = UCL,
CI = c(LCL, UCL),
vecJackMeans = vecJackMeans,
vecPseudoVals = vecPseudoVals))
}
# jackMean(1:8)
# bootstrap::jackknife(1:8, mean)
jackMean(tmpTib$meanCOREOM) -> listJackknife
[1] 1.962351
bootstrap::jackknife(tmpTib$meanCOREOM, mean)$jack.bias
[1] 0
bootstrap::jackknife(tmpTib$meanCOREOM, mean)$jack.se
[1] 0.01688115
listJackknife$bias
[1] 0
listJackknife$SE
[1] 0.01688115
listJackknife$CI
[1] 0.9297305 0.9959840
range(listJackknife$vecJackMeans)
[1] 0.9608099 0.9638250
[1] 0.9628573
mean(tmpTib$meanCOREOM)
[1] 0.9628573
ggplot(data = tmpTibJackknife,
aes(x = value)) +
geom_histogram() +
geom_vline(xintercept = mean(tmpTibJackknife$value),
colour = "green",
size = 2) +
xlab("Jackknife means") +
ylab("Count")
ggsave(filename = "jackknife2.png",
width = 1700,
height = 1470,
units = "px")
### create an evenly spread small subsample
tmpTib %>%
arrange(meanCOREOM) %>%
mutate(rowN = row_number()) %>%
filter(rowN %% 20 == 10) -> tmpTib2
# max(tmpTib2$rowN)
# nrow(tmpTib2)
ggplot(data = tmpTib2,
aes(x = meanCOREOM)) +
geom_histogram() +
xlab("CORE-OM total score (item mean)") +
ylab("Count")
[1] 0.9687147
var(tmpTib2$meanCOREOM)
[1] 0.296774
[1] 0.01726169
[1] 0.8177122
[1] 1.119717
### jackknifing
jackMean(tmpTib2$meanCOREOM) -> listJackknife2
[1] 2.009575
bootstrap::jackknife(tmpTib2$meanCOREOM, mean)$jack.bias
[1] 0
bootstrap::jackknife(tmpTib2$meanCOREOM, mean)$jack.se
[1] 0.07704206
listJackknife2$bias
[1] 0
listJackknife2$SE
[1] 0.07704206
listJackknife2$CI
[1] 0.8138928 1.1235365
range(listJackknife2$vecJackMeans)
[1] 0.9356632 0.9866836
listJackknife2$vecJackMeans %>%
as_tibble() -> tmpTibJackknife2
ggplot(data = tmpTibJackknife2,
aes(x = value)) +
geom_histogram() +
geom_vline(xintercept = mean(tmpTibJackknife$value),
colour = "green",
size = 2) +
xlab("Jackknife means") +
ylab("Count")
ggsave(filename = "jackknife4.png",
width = 1700,
height = 1470,
units = "px")
Time difference of 0.3149319 secs
[1] 0.8807916 1.0319069
mean(tmpTibBoot$value)
[1] 0.9630155
tmpRes$t0
[1] 0.9628573
ggplot(data = tmpTibBoot,
aes(x = value)) +
geom_histogram() +
geom_vline(xintercept = mean(tmpTibBoot$value),
colour = "green",
size = 3.5) +
geom_vline(xintercept = mean(tmpTib$meanCOREOM),
colour = "blue")
ggsave(filename = "bootstrap1.png",
width = 1700,
height = 1470,
units = "px")
tmpRes$t0
[1] 0.9628573
boot::boot.ci(tmpRes)
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates
CALL :
boot::boot.ci(boot.out = tmpRes)
Intervals :
Level Normal Basic
95% ( 0.9292, 0.9962 ) ( 0.9293, 0.9954 )
Level Percentile BCa
95% ( 0.9303, 0.9964 ) ( 0.9304, 0.9965 )
Calculations and Intervals on Original Scale
Time difference of 0.08940196 secs
[1] 0.7056996 1.2611440
ggplot(data = tmpTibBoot,
aes(x = value)) +
geom_histogram() +
geom_vline(xintercept = mean(tmpTibBoot$value),
colour = "green",
size = 3.5) +
geom_vline(xintercept = mean(tmpTib$meanCOREOM),
colour = "blue",
size = 2)
ggsave(filename = "bootstrap2.png",
width = 1700,
height = 1470,
units = "px")
tmpRes$t0
[1] 0.9687147
boot::boot.ci(tmpRes)
BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
Based on 10000 bootstrap replicates
CALL :
boot::boot.ci(boot.out = tmpRes)
Intervals :
Level Normal Basic
95% ( 0.8214, 1.1181 ) ( 0.8157, 1.1128 )
Level Percentile BCa
95% ( 0.8246, 1.1217 ) ( 0.8347, 1.1363 )
Calculations and Intervals on Original Scale
tribble(~DNA, ~Gender, ~n,
"All", "F", 125,
"All", "M", 125,
"DNA...", "F", 125,
"DNA...", "M", 125) -> tibAssoc125
tibAssoc125 %>%
uncount(n) -> tibAssoc125long
### good old fashioned, well hybrid, way to do this
tibAssoc125long %>%
summarise(chisq = list(unlist(chisq.test(Gender, DNA)))) %>%
unnest_wider(chisq)
# A tibble: 1 × 21
`statistic.X-squared` parameter.df p.value method data.name
<chr> <chr> <chr> <chr> <chr>
1 0 1 1 Pearson's Chi-… Gender a…
# ℹ 16 more variables: observed1 <chr>, observed2 <chr>,
# observed3 <chr>, observed4 <chr>, expected1 <chr>,
# expected2 <chr>, expected3 <chr>, expected4 <chr>,
# residuals1 <chr>, residuals2 <chr>, residuals3 <chr>,
# residuals4 <chr>, stdres1 <chr>, stdres2 <chr>, stdres3 <chr>,
# stdres4 <chr>
### let's see if I can get to understand the purrr and broom approach
tibAssoc125long %>%
### nest() makes all the data into a row with a dataframe in it of all the data
### have to use "data = everything" when not using it after group_by() so it knows to use all columns
nest(data = everything()) %>%
### so now we have a single row tibble so mutate() not summarise()
### this says apply chisq.test to the data computing the chisq.test for Gender and DNA
### this seems horrible syntax to me
mutate(chisq = purrr::map(data, ~chisq.test(.$Gender, .$DNA)),
### this is similar: apply broom::tidy() to all of chisq
### broom::tidy actually uses tidy.htest() seeing that chisq is an htest list
### so it pulls the elements into a list
tidied = purrr::map(chisq, broom::tidy)) %>%
### at this point you've still got all the data in a column data
### and all the output of chisq.test() as a list
### don't need all that!
select(tidied) %>%
### now unnest the elements of tidied
unnest_wider(tidied)
# A tibble: 1 × 4
statistic p.value parameter method
<dbl> <dbl> <int> <chr>
1 0 1 1 Pearson's Chi-squared test
### this was part of my abortive attempt to create tables I could easily
### paste into WordPress by making them into jpegs or pngs using tools
### it gridExtra. I didn't crack it!
# tt3 <- ttheme_default(core=list(fg_params=list(hjust=0, x=0.1)),
# rowhead=list(fg_params=list(hjust=0, x=0)))
# tibAssoc125long %>%
# tabyl(DNA, Gender) %>%
# as.data.frame() -> tmpDF
#
#
# gridExtra::tableGrob(tmpDF, rows = NULL) -> tmpGrob
# png("test.png", height=1000, width=200)
# gridExtra::grid.table(tmpDF, rows = NULL)
# dev.off()
### trying to shift tables to WordPress using csv and the wpDataTables plugin
### recode to long labels
tibAssoc125long %>%
mutate(DNA = recode(DNA,
"All" = "Attended all sessions",
"DNA..." = "DNAed at least one session")) -> tibAssoc125long
### simplest xtab
tibAssoc125long %>%
tabyl(DNA, Gender) %>%
### get top left cell correct
rename(n = DNA) %>%
write_csv(file = "xtab1.csv")
### simplest xtab
tibAssoc125long %>%
tabyl(DNA, Gender) %>%
### get top left cell correct
rename(n = DNA) %>%
flextable::flextable() %>%
flextable::autofit() %>%
flextable::save_as_image(path = "xtab1.png", zoom = 3)
[1] "xtab1.png"
### row percentages: CSV
tibAssoc125long %>%
tabyl(DNA, Gender) %>%
### get top left cell correct
rename(`n (row %)` = DNA) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "row") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns(position = "front") %>%
write_csv(file = "xtab2.csv")
### row percentages: png
tibAssoc125long %>%
tabyl(DNA, Gender) %>%
### get top left cell correct
rename(`n (row %)` = DNA) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "row") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns(position = "front") %>%
flextable::flextable() %>%
flextable::autofit() %>%
flextable::save_as_image(path = "xtab1rowperc.png", zoom = 3)
[1] "xtab1rowperc.png"
### col percentages
tibAssoc125long %>%
tabyl(DNA, Gender) %>%
### get top left cell correct
rename(`n (col %)` = DNA) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "col") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns(position = "front") %>%
write_csv(file = "xtab3.csv")
### total percentages
tibAssoc125long %>%
tabyl(DNA, Gender) %>%
### get top left cell correct
rename(`n (% of total)` = DNA) %>%
adorn_totals(where = c("row", "col")) %>%
adorn_percentages(denominator = "all") %>%
adorn_pct_formatting(digits = 1) %>%
adorn_ns(position = "front") %>%
write_csv(file = "xtab4.csv")
vecCOREwbItems <- c(4,14,17,31)
vecCOREprobItems <- c(2,5,8,11,13,15,18,20,23,27,28,30)
vecCOREfuncItems <- c(1,3,7,10,12,19,21,25,26,29,32,33)
vecCOREriskItems <- c(6,9,16,22,24,34)
vecCOREnrItems <- c(vecCOREwbItems, vecCOREprobItems, vecCOREfuncItems)
vecCOREwbItems <- str_c("COREOM01_", sprintf("%02.0f", vecCOREwbItems))
vecCOREprobItems <- str_c("COREOM01_", sprintf("%02.0f", vecCOREprobItems))
vecCOREfuncItems <- str_c("COREOM01_", sprintf("%02.0f", vecCOREfuncItems))
vecCOREriskItems <- str_c("COREOM01_", sprintf("%02.0f", vecCOREriskItems))
vecCOREnrItems <- str_c("COREOM01_", sprintf("%02.0f", vecCOREnrItems))
tibUseDat %>%
rowwise() %>%
mutate(meanCOREwb = meanNotNA(c_across(all_of(vecCOREwbItems))),
meanCOREprob = meanNotNA(c_across(all_of(vecCOREprobItems))),
meanCOREfunc = meanNotNA(c_across(all_of(vecCOREfuncItems))),
meanCORErisk = meanNotNA(c_across(all_of(vecCOREriskItems))),
meanCOREnr = meanNotNA(c_across(all_of(vecCOREnrItems))),) %>%
ungroup() -> tibUseDat
### had too much overprinting with the full dataset so:
set.seed(12345)
tibUseDat %>%
### my first ever use of slice_sample(): nice
slice_sample(n = 88) -> tibSmallDat
ggplot(data = tibSmallDat,
aes(x = meanCOREnr, y = meanCORErisk)) +
geom_point() +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
ggsave(filename = "scatterpoint.png",
width = 1700,
height = 1700,
units = "px")
ggplot(data = tibSmallDat,
aes(x = meanCOREnr, y = meanCORErisk, colour = Gender, fill = Gender)) +
geom_point() +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
# A tibble: 1 × 1
`n()`
<int>
1 346
# A tibble: 148 × 3
meanCOREnr meanCORErisk n
<dbl> <dbl> <int>
1 0.643 0 19
2 0.536 0 16
3 0.429 0 14
4 0.75 0 14
5 0.607 0 13
6 0.679 0 13
7 1.07 0 13
8 0.286 0 12
9 0.393 0 12
10 0.893 0.167 12
# ℹ 138 more rows
ggplot(data = tibUseDat,
aes(x = meanCOREnr, y = meanCORErisk)) +
geom_point() +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
ggsave(filename = "scatter1point.png",
width = 1700,
height = 1700,
units = "px")
ggplot(data = tibUseDat,
aes(x = meanCOREnr, y = meanCORErisk)) +
geom_point(alpha = .1) +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
ggsave(filename = "scatter1pointAlph.png",
width = 1700,
height = 1700,
units = "px")
ggplot(data = tibUseDat,
aes(x = meanCOREnr, y = meanCORErisk)) +
geom_count() +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
ggsave(filename = "scatter2count.png",
width = 1700,
height = 1700,
units = "px")
ggplot(data = tibUseDat,
aes(x = meanCOREnr, y = meanCORErisk)) +
geom_count(alpha = .3) +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
ggsave(filename = "scatter3count.png",
width = 1700,
height = 1700,
units = "px")
set.seed(12345)
ggplot(data = tibUseDat,
aes(x = meanCOREnr, y = meanCORErisk)) +
# geom_point(colour = "blue", alpha = .2) +
geom_jitter(width = 0, height = .05, alpha = .2) +
xlab("CORE-OM non-risk score (mean item score)") +
ylab("CORE-OM risk score (mean item score)") +
coord_fixed(ratio = 1) +
xlim(c(0, 4)) +
ylim(c(0, 4))
ggsave(filename = "jitter1.png",
width = 1700,
height = 1700,
units = "px")
tibUseDat %>%
# select(starts_with("COREOM01_")) %>%
# select(-COREOM01_35) %>%
select(all_of(vecCOREriskItems)) %>%
na.omit() %>%
pivot_longer(everything()) %>%
group_by(name) %>%
summarise(var = var(value)) %>%
arrange(desc(var)) %>%
slice_head()
# A tibble: 1 × 2
name var
<chr> <dbl>
1 COREOM01_06 0.591
# A tibble: 1 × 2
name var
<chr> <dbl>
1 COREOM01_27 1.65
ggplot(data = tibUseDat,
aes(x = COREOM01_27, y = COREOM01_06)) +
geom_point() +
xlab("Scores on CORE-OM item 27:\nI have felt unhappy") +
ylab("Scores on CORE-OM item 6:\nI have been physically violent to others")
ggsave(filename = "items27and6_plot1.png",
width = 1700,
height = 1700,
units = "px")
set.seed(12345)
ggplot(data = tibUseDat,
aes(x = COREOM01_27, y = COREOM01_06)) +
geom_jitter(width = .4, height = .4) +
xlab("Scores on CORE-OM item 27:\nI have felt unhappy") +
ylab("Scores on CORE-OM item 6:\nI have been physically violent to others")
ggsave(filename = "items27and6_jitter1.png",
width = 1700,
height = 1700,
units = "px")
set.seed(12345)
ggplot(data = tibUseDat,
aes(x = COREOM01_27, y = COREOM01_06)) +
geom_jitter(width = .4, height = .4, alpha = .3) +
xlab("Scores on CORE-OM item 27:\nI have felt unhappy") +
ylab("Scores on CORE-OM item 6:\nI have been physically violent to others")
ggsave(filename = "items27and6_jitter2.png",
width = 1700,
height = 1700,
units = "px")
set.seed(12345)
n <- 710
tibble(i = 1:n) %>%
mutate(x = rnorm(n, mean = 7),
y = x^3 + rnorm(n, sd = .5),
logY = log(y),
sqrtY = sqrt(y)) -> tmpTib
shapiro.test(tmpTib$y)$p.value -> valPShapiroY
valPShapiroYtext <- paste0("Shapiro test p value = ", prettyNum(valPShapiroY, digits = 3))
ggplot(data = tmpTib,
aes(x = y)) +
geom_histogram(bins = 50,
aes(x = y, after_stat(density)),
fill = "darkgrey") +
geom_density(colour = "red",
linewidth = 2) +
annotate("text",
x = 800,
y = .0025,
label = valPShapiroYtext) +
ylab("Density") +
xlab("Value") +
ggtitle("Histogram and smoothed density curve of raw data") +
theme(axis.text.y = element_blank())
# ggsave(filename = "transform_none.png",
# width = 1700,
# height = 1700,
# units = "px")
# tmpTib %>%
# summarise(minY = min(y),
# maxY = max(y),
# logMinY = log(minY),
# logMaxY = log(maxY),
# minLogY = min(logY),
# maxLogY = max(logY),
# logMinSqrtY = sqrt(minY),
# logMaxSqrtY = sqrt(maxY),
# minSqrtY = min(sqrtY),
# maxSqrtY = max(sqrtY)) -> tmpTibLimits
ggplot(data = tmpTib,
aes(x = y, y = logY)) +
geom_point(alpha = .2) +
geom_line(alpha = .2) +
geom_smooth(method = "lm",
se = FALSE) +
ylab("log(y)") +
ggtitle("Plot of log transformation of raw data",
subtitle = "Blue line is best linear fit")
# ggsave(filename = "log_transform_plot.png",
# width = 1700,
# height = 1700,
# units = "px")
shapiro.test(tmpTib$logY)$p.value -> valPShapiroLogY
valPShapiroLogYtext <- paste0("Shapiro test p value = ", prettyNum(valPShapiroLogY, digits = 3))
ggplot(data = tmpTib,
aes(x = logY)) +
geom_histogram(bins = 50,
aes(x = logY, after_stat(density)),
fill = "darkgrey") +
geom_density(colour = "red",
linewidth = 2) +
annotate("text",
x = 5,
y = .9,
label = valPShapiroLogYtext) +
ylab("Density") +
xlab("Log(value)") +
ggtitle("Histogram and density curve of log transformed data") +
theme(axis.text.y = element_blank())
# ggsave(filename = "transform_log.png",
# width = 1700,
# height = 1700,
# units = "px")
ggplot(data = tmpTib,
aes(x = y, y = sqrtY)) +
geom_point(alpha = .2) +
geom_line(alpha = .2) +
geom_smooth(method = "lm",
se = FALSE) +
ylab("sqrt(y)") +
ggtitle("Plot of square root transformation of raw data",
subtitle = "Blue line is best linear fit")
# ggsave(filename = "sqrt_transform_plot.png",
# width = 1700,
# height = 1700,
# units = "px")
shapiro.test(tmpTib$sqrtY)$p.value -> valPShapiroSqrtY
valPShapiroSqrtYtext <- paste0("Shapiro test p value = ", prettyNum(valPShapiroSqrtY, digits = 3))
ggplot(data = tmpTib,
aes(x = sqrtY)) +
geom_histogram(bins = 50,
aes(x = sqrtY, after_stat(density)),
fill = "darkgrey") +
geom_density(colour = "red",
linewidth = 2) +
annotate("text",
x = 28,
y = .12,
label = valPShapiroSqrtYtext) +
ylab("Density") +
xlab("Sqrt(value)") +
ggtitle("Histogram and density curve of\n square root transformed data") +
theme(axis.text.y = element_blank())
# ggsave(filename = "transform_sqrt.png",
# width = 1700,
# height = 1700,
# units = "px")
tibble(x = seq(.05, 4, .05)) %>%
mutate(y = sqrt(x)) -> tmpTib
ggplot(data = tmpTib,
aes(x = x, y = y)) +
geom_point() +
geom_line() +
geom_abline(slope = 1, intercept = 0,
linetype = 3) +
scale_x_continuous(breaks = seq(0, 4, .2)) +
scale_y_continuous(expression(y == sqrt(x)),
breaks = seq(0, 4, .2)) +
labs(title = expression(paste("Plot of ",
sqrt(x),
" against x")),
subtitle = expression(paste(y == x,
" line shows that ",
sqrt(x),
" is smaller than x for x > 1",
" and larger than x for x < 1")))
# ggsave(filename = "transform_sqrt2.png",
# width = 2000,
# height = 2000,
# units = "px")
tibble(ID = as.character(1:3),
x = c(1, 7, 10)) %>%
mutate(mean = mean(x),
discrep = x - mean,
discrepSq = discrep^2) -> tmpTib
tmpTib %>%
summarise(across(x:discrepSq, mean)) %>%
mutate(ID = "Mean:") %>%
select(ID, everything()) -> tmpTibMeans
tmpTib %>%
summarise(across(x:discrepSq, sum)) %>%
mutate(ID = "Sum:") %>%
select(ID, everything()) -> tmpTibSums
tmpTib %>%
bind_rows(tmpTibSums,
tmpTibMeans) %>%
write_csv(file = "variance1.csv")
tmpTib %>%
mutate(y = 1:3,
y0 = 0, ymax = 3) -> tmpTib
ggplot(tmpTib,
aes(x = x)) +
# geom_linerange(aes(x = x, ymin = y0, ymax = ymax)) +
geom_point(aes(y = y)) +
geom_vline(xintercept = 6,
colour = "blue") +
geom_segment(aes(x = x, xend = mean, y = y, yend = y),
colour = "green",
arrow = arrow(length = unit(.15, "inches"),
ends = "both")) +
ylim(0.5, 3.5) +
scale_x_continuous(breaks = 0:12,
limits = c(0, 12)) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank()) +
ggtitle("Discrepancies between observations and the mean",
subtitle = "Observations are black points, their mean is the blue reference line")
ggsave(filename = "discrepancies.png",
width = 2000,
height = 2000,
units = "px")
tibble(n = 3:50) %>%
mutate(meanDiscrepSq = 5,
estPopVar = meanDiscrepSq * n / (n - 1)) -> tmpTib
ggplot(data = tmpTib,
aes(x = n, y = estPopVar)) +
geom_point() +
geom_hline(yintercept = 5,
colour = "blue") +
ylab("Variance") +
ylim(c(0, 8)) +
ggtitle("Plot of estimated population variance for observed variance in sample of 5",
subtitle = "Blue line shows observed variance")
ggsave(filename = "varianceBias1.png",
width = 2000,
height = 2000,
units = "px")
Now a simulation.
tibble(n = c(3:10, seq(15, 50, 5))) %>%
mutate(rep = list(1:1000)) %>%
unnest_longer(rep) %>%
rowwise() %>%
mutate(sample = list(rnorm(n, sd = sqrt(5))),
varEst = var(sample),
varObs = varEst * (n - 1) / n) %>%
ungroup() %>%
select(-sample) -> tmpTib
tmpTib %>%
group_by(n) %>%
summarise(meanVarEst = mean(varEst),
meanVarObs = mean(varObs)) -> tmpTibMeans
ggplot(data = tmpTib,
aes(x = n)) +
geom_violin(aes(y = varObs, group = n),
fill = "grey") +
geom_point(data = tmpTibMeans,
aes(x = n, y = meanVarObs)) +
geom_hline(yintercept = 5,
colour = "blue") +
ylab("Estimated population variance") +
xlab("Sample size (n)") +
ggtitle("Observed variances i.e. denominator n, against sample size",
subtitle = paste0("Violin plot shapes show actual values from n = 1,000 simulations",
"\nDots mark mean values per sample size, blue line is true population variance, 5"))
# ggsave(filename = "varianceBias2.png",
# width = 2000,
# height = 2000,
# units = "px")
That clearly shows the bias: underestimating variance and that it decreases with sample size.
ggplot(data = tmpTib,
aes(x = n)) +
geom_violin(aes(y = varEst, group = n),
fill = "grey") +
geom_point(data = tmpTibMeans,
aes(x = n, y = meanVarEst)) +
geom_hline(yintercept = 5,
colour = "blue") +
ylab("Estimated population variance") +
xlab("Sample size (n)") +
ggtitle("Plot of estimated variances against sample size",
subtitle = paste0("Violin plot shapes show actual values from n = 1,000 simulations",
"\nDots mark mean values per sample size, blue line is true population variance, 5"))
# ggsave(filename = "varianceBias3.png",
# width = 2000,
# height = 2000,
# units = "px")
That shows that using the denominator of n - 1 corrects the bias and estimates the population variance (set at 5) without bias.
Very simple stuff for the linear regression entry.
tibble(x = seq(-1, 4, .02)) %>%
mutate(y = .5 * x + 1) -> tmpTib
ggplot(data = tmpTib,
aes(x = x, y = y)) +
geom_line(linewidth = 2) +
geom_vline(xintercept = 0,
linetype = 1) +
geom_hline(yintercept = 0,
linetype = 1) +
annotate("text",
x = .1, y = 2.2,
parse = TRUE,
label = "y == .5 * x + 1",
hjust = 0,
size = 10) +
ggtitle("Illustration of simple linear regression line") +
theme(plot.title = element_text(size = 24),
plot.subtitle = element_text(size = 18))
# ggsave(filename = "linearRegression1.png",
# width = 3000,
# height = 2000,
# units = "px")
Label the intercept
tibble(x = 0,
y = 1) -> tmpTibIntercept
ggplot(data = tmpTib,
aes(x = x, y = y)) +
geom_line(linewidth = 2) +
geom_vline(xintercept = 0,
linetype = 1) +
geom_hline(yintercept = 0,
linetype = 1) +
geom_point(data = tmpTibIntercept,
size = 5) +
geom_hline(yintercept = 1,
linetype = 3) +
annotate("text",
x = .1, y = 2.2,
parse = TRUE,
label = "y == .5 * x + 1",
hjust = 0,
size = 10) +
ggtitle("Illustration of simple linear regression line",
subtitle = "Intercept point labelled with reference line") +
theme(plot.title = element_text(size = 24),
plot.subtitle = element_text(size = 18))
# ggsave(filename = "linearRegression2intercept.png",
# width = 3000,
# height = 2000,
# units = "px")
Explain the slope
ggplot(data = tmpTib,
aes(x = x, y = y)) +
geom_line(linewidth = 1) +
geom_vline(xintercept = 0,
linetype = 1) +
geom_hline(yintercept = 0,
linetype = 1) +
geom_point(data = tmpTibIntercept,
size = 5) +
geom_hline(yintercept = 1,
linetype = 3) +
geom_segment(aes(x = 2, xend = 3, y = 2, yend = 2),
arrow = arrow(ends = "last",
angle = 12,
length = unit(5, "mm"),
type = "closed")) +
geom_segment(aes(x = 3, xend = 3, y = 2, yend = 2.5),
arrow = arrow(ends = "last",
angle = 12,
length = unit(5, "mm"),
type = "closed")) +
annotate("text",
x = 2.5, y = 1.9,
label = "1",
hjust = .5,
size = 5) +
annotate("text",
x = 3.13, y = 2.25,
label = "0.5",
hjust = .5,
size = 5) +
annotate("text",
x = .1, y = 2.2,
parse = TRUE,
label = "y == .5 * x + 1",
hjust = 0,
size = 10) +
ggtitle("Illustration of simple linear regression line",
subtitle = "Slope explained") +
theme(plot.title = element_text(size = 24),
plot.subtitle = element_text(size = 18))
# ggsave(filename = "linearRegression3slope.png",
# width = 3000,
# height = 2000,
# units = "px")
Add noise. (I confess I tinkered with the seed to get what I wanted!)
set.seed(1)
nrow(tmpTib) -> n
tmpTib %>%
mutate(y2 = y + rnorm(n, mean = 0, sd = .25)) -> tmpTib2
ggplot(data = tmpTib2,
aes(x = x, y = y2)) +
geom_line(aes(y = y)) +
geom_point(alpha = .5) +
geom_smooth(method = "lm") +
geom_vline(xintercept = 0,
linetype = 1) +
geom_hline(yintercept = 0,
linetype = 1) +
annotate("text",
x = .1, y = 2.2,
parse = TRUE,
label = "y == .5 * x + 1",
hjust = 0,
size = 10) +
ggtitle("Illustration of simple linear regression line",
subtitle = "With random Gaussian noise added") +
theme(plot.title = element_text(size = 24),
plot.subtitle = element_text(size = 18))
# ggsave(filename = "linearRegression1noisy.png",
# width = 3000,
# height = 2000,
# units = "px")
### get the lm
lm(tmpTib2$y2 ~ tmpTib2$x) -> tmpLm
The intercept of that best fit linear regression is 1.022 and the slope is 0.489.
This was just to create a plot and table for a glossary entry about attenuation by unreliability of measurement.
set.seed(12345)
n <-5000 # the number of points per simulation
nNoise <- 6 # the number of levels of noise I am going to inject
matCorr <- matrix(c(1, .9, .9, 1), nrow = 2) # correlation matrix I want (pass it into MASS::mvrnorm())
vecMean <- c(0, 0) # mean vector also needed for MASS::mvrnorm()
as_tibble(MASS::mvrnorm(n = n * nNoise, mu = vecMean, Sigma = matCorr), # does the simulation of the samples
### need this to stop as_tibble() grumbling
.name_repair = "unique") %>%
### but I want better variable names so ...
rename(xpopn = `...1`,
ypopn = `...2`) %>%
### now create a long variable of the levels of noise to add
mutate(noise = rep(seq(0, 1, .2), each = n)) %>%
### now use those noise values to add the noise!
rowwise() %>%
mutate(x = xpopn + rnorm(1, sd = noise),
y = ypopn + rnorm(1, sd = noise)) %>%
ungroup() -> tmpTib
### plot that (I never got a plot I really liked but it will do)
ggplot(data = tmpTib,
aes(x = x, y = y)) +
facet_wrap(facets = vars(noise),
ncol = 2) +
geom_smooth(method = "lm") +
geom_point(alpha = .05) +
### for Rblog
ggpubr::stat_cor(size = 3, label.y = 4.2) +
ggpubr::stat_regline_equation(size = 3, label.y = 2.5)
### to save for the glossary
# ggpubr::stat_cor(size = 5, label.y = 4.2) +
# ggpubr::stat_regline_equation(size = 5, label.y = 2.5)
# ggsave(filename = "attenuation.png",
# height = 2400,
# width = 3200,
# units = "px")
I think that shows clearly enough the impact of the added noise attenuating the correlation and reducing the slope
tmpTib %>%
group_by(noise) %>%
summarise(sampCorr = cor(x, y)) %>%
mutate(rel = 1 / (1 + noise^2),
attCorrectedCorr = sampCorr / rel) -> tmpTibSummary
tmpTibSummary %>%
select(noise, rel, sampCorr, attCorrectedCorr) %>%
flextable() %>%
colformat_double(digits = 3)
noise | rel | sampCorr | attCorrectedCorr |
---|---|---|---|
0.000 | 1.000 | 0.903 | 0.903 |
0.200 | 0.962 | 0.867 | 0.902 |
0.400 | 0.862 | 0.764 | 0.886 |
0.600 | 0.735 | 0.654 | 0.889 |
0.800 | 0.610 | 0.543 | 0.891 |
1.000 | 0.500 | 0.453 | 0.907 |
tmpTibSummary %>%
mutate(across(everything(), ~ sprintf("%05.3f", .x))) %>%
write_csv(file = "attenuation.csv")
And that showed that the correction:
\[correctedCorr = \frac{corr}{\sqrt{rel_{x}rel_{y}}}\]
works!
save.image(file = "Glossary.rda")
load(file = "Glossary.rda")
27/04/2024 at 17:03
Text and figures are licensed under Creative Commons Attribution CC BY-SA 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Evans (2021, Nov. 9). Chris (Evans) R SAFAQ: For OMbook glossary. Retrieved from https://www.psyctc.org/R_blog/posts/2021-11-09-ombookglossary/
BibTeX citation
@misc{evans2021for, author = {Evans, Chris}, title = {Chris (Evans) R SAFAQ: For OMbook glossary}, url = {https://www.psyctc.org/R_blog/posts/2021-11-09-ombookglossary/}, year = {2021} }