pacman::p_load(tidyverse, plotly, crosstalk, DT, ggdist, gganimate,ggstatsplot,readxl, performance, parameters, see)Hands-on_Ex04
Visual Statistical Analysis with ggstatsplot
Import package and data
library(readr)
exam_data <- read_csv("data/Exam_data.csv")One sample test graph
set.seed(1234)
gghistostats(data=exam_data,x=ENGLISH, type="bayes",
test.values=70,xlab="English score")
Two sample mean test
Compare distribution/density of female and male performance in Math test
ggbetweenstats(
data=exam_data,
x=GENDER,
y=MATHS,
type='np',
message=FALSE)
One way ANOVA test
ggbetweenstats(data=exam_data,
x=RACE,
y=ENGLISH,
type='p',
mean.ci=TRUE,
pairwise.comparisons = TRUE,
pairwise.display = 's',
p.adjust.method = 'fdr',
message=FALSE
)
Correlatin test
Can see Pearson correlation coefficient
ggscatterstats(data=exam_data,
x=MATHS,
y=ENGLISH,
marginal = FALSE
)
Significant Test of Association
exam=exam_data %>%
mutate(MATHS_bins=
cut(MATHS,
breaks=c(0,60,75,85,100)))
ggbarstats(data=exam,
x=MATHS_bins,
y=GENDER)
Toyota Corolla case with linear regression
Import the data
resale_car <- read_xls("data/ToyotaCorolla.xls",
"data")
colnames(resale_car) [1] "Id" "Model" "Price" "Age_08_04"
[5] "Mfg_Month" "Mfg_Year" "KM" "Quarterly_Tax"
[9] "Weight" "Guarantee_Period" "HP_Bin" "CC_bin"
[13] "Doors" "Gears" "Cylinders" "Fuel_Type"
[17] "Color" "Met_Color" "Automatic" "Mfr_Guarantee"
[21] "BOVAG_Guarantee" "ABS" "Airbag_1" "Airbag_2"
[25] "Airco" "Automatic_airco" "Boardcomputer" "CD_Player"
[29] "Central_Lock" "Powered_Windows" "Power_Steering" "Radio"
[33] "Mistlamps" "Sport_Model" "Backseat_Divider" "Metallic_Rim"
[37] "Radio_cassette" "Tow_Bar"
Build multiple linear regression
model <- lm(Price ~ Age_08_04 + Mfg_Year + KM +
Weight + Guarantee_Period, data = resale_car)
model
Call:
lm(formula = Price ~ Age_08_04 + Mfg_Year + KM + Weight + Guarantee_Period,
data = resale_car)
Coefficients:
(Intercept) Age_08_04 Mfg_Year KM
-2.637e+06 -1.409e+01 1.315e+03 -2.323e-02
Weight Guarantee_Period
1.903e+01 2.770e+01
Check multicollinearity
One way to detect multicollinearity (whether independent variables are highly correlated) is to calculate the variance inflation factor (VIF) for each independent variable.
c <- check_collinearity(model)
plot(c)
Checking normality assumption
Build model1(remove one highly correlated variable of mfg_year)
model1 <- lm(Price ~ Age_08_04 + KM +
Weight + Guarantee_Period, data = resale_car)
check_n <- check_normality(model1)
plot(check_n)
Check model for homogeneity of variances
Significance testing for linear regression models assumes that the model errors (or residuals) have constant variance.
check_v <- check_heteroscedasticity(model1)
plot(check_v)
Complete check
Can also check all the assumptions by one step. Influential observation is an observation in a dataset that, when removed, dramatically changes the coefficient estimates of a regression model
check_model(model1)
Parameter plot
See the coefficient direction and strength in the plot.
plot(parameters(model1))
Visualising Regression Parameters
ggcoefstats(model1,
output = "plot")
Visualize uncertainty of point estimates
point estimate such as mean, addressed with uncertainty like CI se: standard error measures the variability of the sample means, estimate the precision of the sample mean as an estimate of the population mean.
sd/sqrt(n-1), n-1 can been thought as degree of freedom
sum_num <- exam_data %>%
group_by(RACE) %>%
summarise(n=n(),
mean=round(mean(MATHS),2),
sd=round(sd(MATHS),2)) %>%
mutate(se=round(sd/sqrt(n-1),2))
sum_num# A tibble: 4 × 5
RACE n mean sd se
<chr> <int> <dbl> <dbl> <dbl>
1 Chinese 193 76.5 15.7 1.13
2 Indian 12 60.7 23.4 7.04
3 Malay 108 57.4 21.1 2.04
4 Others 9 69.7 10.7 3.79
knitr::kable(head(sum_num),format='html')| RACE | n | mean | sd | se |
|---|---|---|---|---|
| Chinese | 193 | 76.51 | 15.69 | 1.13 |
| Indian | 12 | 60.67 | 23.35 | 7.04 |
| Malay | 108 | 57.44 | 21.13 | 2.04 |
| Others | 9 | 69.67 | 10.72 | 3.79 |
Standard error visulization
ggplot(sum_num)+
geom_errorbar(
aes(x=RACE,
ymin=mean-se,
ymax=mean+se),
width=0.2,
color='black',
alpha=0.9,
size=1)+
geom_point(aes(x=RACE,
y=mean),
stat='identity',
color='red',
size=2,
alpha=1)+
ggtitle("Standard error of mean
maths score by race")
95% Confidence interval
use qnorm(0.975)=1.96 to calculate lower and upper bound
sum_num$RACE <- factor(sum_num$RACE,levels = sum_num$RACE[order(-sum_num$mean)])
ggplot(sum_num)+
geom_errorbar(
aes(x=RACE,
ymin=mean-1.96*se,
ymax=mean+1.96*se),
width=0.2,
color='black',
alpha=0.95,
size=1)+
geom_point(aes(x=RACE,
y=mean),
stat='identity',
color='red',
size=2,
alpha=1)+
ggtitle("95% confidence interval of mean maths score by race")
Uncertainty of point estimates with interactive error bars
data <- highlight_key(sum_num)
p <- ggplot(data)+
geom_errorbar(
aes(x=RACE,
ymin=mean-2.32*se,
ymax=mean+2.32*se),
width=0.2,
color='black',
alpha=0.99,
size=1)+
geom_point(aes(x=RACE,
y=mean),
stat='identity',
color='red',
size=2,
alpha=1)+
ggtitle("99% confidence interval of mean maths score by race")
gg <- highlight(ggplotly(p),"plotly_selected")
crosstalk::bscols(gg,DT::datatable(data),widths = 5)Confidence interval plot with ggdist
exam_data %>%
ggplot(aes(x=RACE,y=MATHS,))+
stat_pointinterval()+
labs(
title="Visualising confidence intervals of mean math score",
subtitle = "Mean Point + Confidence-interval plot")
Use stat_gradientinterval
exam_data %>%
ggplot(aes(x = RACE,
y = MATHS)) +
stat_gradientinterval(
fill='skyblue',
show.legend=TRUE
)+
labs(
title = "Visualising confidence intervals of mean math score",
subtitle = "Gradient + interval plot"
)
Hypothetical Outcome Plots
library(ungeviz)Sample 25 data each time, and plot horizontal line grouping by race.
ggplot(data=exam_data,
aes(x=factor(RACE),y=MATHS))+
geom_point(position=position_jitter(),size=0.5)+
geom_hpline(data=sampler(25,group = RACE),color = "#D55E00",size=0.1)+
theme_bw()
Transition_states means create sequence of frames to have animation of changes
Draw indicating generating a column of sampling, starting with first frame to the twentieth frame
ggplot(data=exam_data,
aes(x=factor(RACE),y=MATHS))+
geom_point(position=position_jitter(),size=0.5)+
geom_hpline(data=sampler(25,group = RACE),color = "#D55E00",size=0.1)+
theme_bw()+
transition_states(.draw,1,20)
exam_data# A tibble: 322 × 7
ID CLASS GENDER RACE ENGLISH MATHS SCIENCE
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Student321 3I Male Malay 21 9 15
2 Student305 3I Female Malay 24 22 16
3 Student289 3H Male Chinese 26 16 16
4 Student227 3F Male Chinese 27 77 31
5 Student318 3I Male Malay 27 11 25
6 Student306 3I Female Malay 31 16 16
7 Student313 3I Male Chinese 31 21 25
8 Student316 3I Male Malay 31 18 27
9 Student312 3I Male Malay 33 19 15
10 Student297 3H Male Indian 34 49 37
# ℹ 312 more rows
Funnel Plots for Fair Comparisons
pacman::p_load(tidyverse, FunnelPlotR, plotly, knitr)covid19 <- read_csv("data/COVID-19_DKI_Jakarta.csv") %>%
mutate_if(is.character,as.factor)
covid19# A tibble: 267 × 7
`Sub-district ID` City District `Sub-district` Positive Recovered Death
<dbl> <fct> <fct> <fct> <dbl> <dbl> <dbl>
1 3172051003 JAKARTA U… PADEMAN… ANCOL 1776 1691 26
2 3173041007 JAKARTA B… TAMBORA ANGKE 1783 1720 29
3 3175041005 JAKARTA T… KRAMAT … BALE KAMBANG 2049 1964 31
4 3175031003 JAKARTA T… JATINEG… BALI MESTER 827 797 13
5 3175101006 JAKARTA T… CIPAYUNG BAMBU APUS 2866 2792 27
6 3174031002 JAKARTA S… MAMPANG… BANGKA 1828 1757 26
7 3175051002 JAKARTA T… PASAR R… BARU 2541 2433 37
8 3175041004 JAKARTA T… KRAMAT … BATU AMPAR 3608 3445 68
9 3171071002 JAKARTA P… TANAH A… BENDUNGAN HIL… 2012 1937 38
10 3175031002 JAKARTA T… JATINEG… BIDARA CINA 2900 2773 52
# ℹ 257 more rows
PR: proportional ratio, indicates that the data represents the ratio of the numerator (deaths) to the denominator (positive cases) for each sub-district
funnel_plot(numerator = covid19$Death,denominator = covid19$Positive,
group=covid19$`Sub-district`,
data_type = "PR",
x_range = c(0,6500),
y_range=c(0,0.05),
label=NA,
title = "COVID-19 Fatality Rate by Positive Cases",
x_label="Cumulative COVID-19 Positive Cases",
y_label="Cumulative Fatality Rate"
)
A funnel plot object with 267 points of which 7 are outliers.
Plot is adjusted for overdispersion.
Customnized funnel plot
Standard error formula for probability: √ [p (1-p) / n)
Use reciprocal and square so bigger standard error can have bigger weight in the weighted mean
seq function generates a sequence of number from 1 to maximum value of positive, incremental by 1, so that can count confidence interval of different sample size
Show code
df <- covid19 %>%
mutate(rate=Death/Positive) %>%
mutate(rate.se=sqrt(rate*(1-rate)/Positive)) %>%
filter(rate>0)
fit.mean <- weighted.mean(df$rate,1/(df$rate.se^2))
number <- seq(1,max(df$Positive),1)
upper.95 <- fit.mean+1.96*sqrt(fit.mean*(1-fit.mean)/number)
lower.95 <- fit.mean-1.96*sqrt(fit.mean*(1-fit.mean)/number)
upper.99 <- fit.mean+3.29*sqrt(fit.mean*(1-fit.mean)/number)
lower.99 <- fit.mean-3.29*sqrt(fit.mean*(1-fit.mean)/number)
table <- data.frame(upper.95,lower.95,upper.99,lower.99,number,fit.mean)
p <-ggplot(df,aes(x=Positive,y=rate))+
geom_point(aes(label=(label=`Sub-district`),alpha=0.4))+
geom_line(data=table,aes(x=number,y=upper.95),size = 0.4,
colour = "grey40",
linetype = "dashed")+
geom_line(data=table,aes(x=number,y=lower.95),size = 0.4,
colour = "grey40",
linetype = "dashed")+
geom_line(data=table,aes(x=number,y=upper.99),size = 0.4,
colour = "grey40",
linetype = "dashed")+
geom_line(data=table,aes(x=number,y=lower.99),size = 0.4,
colour = "grey40",
linetype = "dashed")+
geom_hline(data=table,aes(yintercept=fit.mean),size = 0.4,
colour = "grey40")+
coord_cartesian(ylim=c(0,0.05)) +
annotate("text", x = 1, y = -0.13, label = "95%", size = 3, colour = "grey40") +
annotate("text", x = 4.5, y = -0.18, label = "99%", size = 3, colour = "grey40") +
ggtitle("Cumulative Fatality Rate by Cumulative Number of COVID-19 Cases") +
xlab("Cumulative Number of COVID-19 Cases") +
ylab("Cumulative Fatality Rate") +
theme_light() +
theme(plot.title = element_text(size=12),
legend.position = c(0.91,0.85),
legend.title = element_text(size=7),
legend.text = element_text(size=7),
legend.background = element_rect(colour = "grey60", linetype = "dotted"),
legend.key.height = unit(0.3, "cm"))
p
Interactive plot
interative_p <-ggplotly(p,tooltip=c("label","x","y"))
interative_p