https://www.bookdown.org/rwnahhas/RMPH/blr-orlr.html
data: https://www.bookdown.org/rwnahhas/RMPH/appendix-nsduh.html#appendix-nsduh
getwd()
# 보통 C:/Users/Username/Documents/
setwd("~/rData")
Odds (승산): 한 사건이 일어날 확률과 그 반대의 확률 (일어나지 않을 확률) 간의 비율 한 사건이 일어날 확률과 다름에 주의하라.
\begin{align}
\displaystyle \frac {p} {1-p}
\end{align}
> log(1/6) [1] -1.791759 > log(6) [1] 1.791759 >
Odds ratio (승비): Odds ratio는 두 odds 간의 비율을 말한다.
##########
# see youtube
# https://youtu.be/8nm0G-1uJzA
n.mut <- 23+117
n.norm <- 6+210
p.cancer.mut <- 23/(23+117)
p.cancer.norm <- 6/(6+210)
set.seed(1011)
c <- runif(n.mut, 0, 1)
# 0 = not cancer, 1 = cancer among mutant gene
mutant <- ifelse(c>=p.cancer.mut, 0, 1)
c <- runif(n.norm, 0, 1)
# 0 = not cancer, 1 = cancer among normal gene
normal <- ifelse(c>=p.cancer.norm, 0, 1)
# 0 = mutant; 1 = normal
gene <- c(rep(0, length(mutant)), rep(1, length(normal)))
# 0 = not cancer; 1 = cancer
cancer <- c(mutant, normal)
df <- as.data.frame(cbind(gene, cancer))
df
df$gene <- factor(df$gene, levels = c(0,1), labels = c("mutant", "norm"))
df$cancer <- factor(df$cancer, levels = c(0,1), labels = c("nocancer", "cancer"))
df
tab <- table(df)
tab
tab[1,2]
tab[1,1]
# p.c.m = p.cancer.mut the above
p.cancer.mutant <- tab[1,2]/(tab[1,1]+tab[1,2])
p.nocancer.mutant <- tab[1,1]/(tab[1,1]+tab[1,2])
p.cancer.mutant
1-p.cancer.mutant
p.nocancer.mutant
p.cancer.norm <- tab[2,2]/(tab[2,1]+tab[2,2])
p.nocancer.norm <- 1-p.cancer.norm
p.cancer.norm
p.nocancer.norm
odds(p.cancer.mutant)
odds(p.cancer.norm)
odds.ratio(p.cancer.mutant, p.cancer.norm)
> ##########
> # see youtube
> # https://youtu.be/8nm0G-1uJzA
> n.mut <- 23+117
> n.norm <- 6+210
> p.cancer.mut <- 23/(23+117)
> p.cancer.norm <- 6/(6+210)
>
> set.seed(1011)
> c <- runif(n.mut, 0, 1)
> # 0 = not cancer, 1 = cancer among mutant gene
> mutant <- ifelse(c>=p.cancer.mut, 0, 1)
>
> c <- runif(n.norm, 0, 1)
> # 0 = not cancer, 1 = cancer among normal gene
> normal <- ifelse(c>=p.cancer.norm, 0, 1)
>
> # 0 = mutant; 1 = normal
> gene <- c(rep(0, length(mutant)), rep(1, length(normal)))
> # 0 = not cancer; 1 = cancer
> cancer <- c(mutant, normal)
>
> df <- as.data.frame(cbind(gene, cancer))
> df
gene cancer
1 0 0
2 0 1
3 0 0
4 0 0
5 0 0
6 0 0
>
> df$gene <- factor(df$gene, levels = c(0,1), labels = c("mutant", "norm"))
> df$cancer <- factor(df$cancer, levels = c(0,1), labels = c("nocancer", "cancer"))
> df
gene cancer
1 mutant nocancer
2 mutant cancer
3 mutant nocancer
4 mutant nocancer
5 mutant nocancer
6 mutant nocancer
>
> tab <- table(df)
> tab
cancer
gene nocancer cancer
mutant 121 19
norm 210 6
> tab[1,2]
[1] 19
> tab[1,1]
[1] 121
>
> # p.c.m = p.cancer.mut the above
> p.cancer.mutant <- tab[1,2]/(tab[1,1]+tab[1,2])
> p.nocancer.mutant <- tab[1,1]/(tab[1,1]+tab[1,2])
> p.cancer.mutant
[1] 0.1357143
> 1-p.cancer.mutant
[1] 0.8642857
> p.nocancer.mutant
[1] 0.8642857
>
> p.cancer.norm <- tab[2,2]/(tab[2,1]+tab[2,2])
> p.nocancer.norm <- 1-p.cancer.norm
> p.cancer.norm
[1] 0.02777778
> p.nocancer.norm
[1] 0.9722222
>
> odds(p.cancer.mutant)
[1] 0.1570248
> odds(p.cancer.norm)
[1] 0.02857143
> odds.ratio(p.cancer.mutant, p.cancer.norm)
[1] 5.495868
>
여기서
\begin{align*}
ln(x) & = y \\
log_e {x} & = y \\
x & = e^{y} \\
\end{align*}
위에서
> load("nsduh2019_adult_sub_rmph.RData")
> # Shorter name
> nsduh <- nsduh_adult_sub
> tab <- table(nsduh$demog_sex, nsduh$mj_lifetime)
> tab
No Yes
Male 206 260
Female 285 249
>
# Marijuana experience (me) in lifetime NO YES Male 206 260 466 Female 285 249 534 491 509 1000 P(me among males) = 260 / 466 = 0.5579399 P(me among females) = 249 / 534 = 0.4662921 Odds for males = 260 / 206 = 1.262136 Odds for females = 249 / 285 = 0.8736842 Odds ratio between males and females = (260 / 206) / (249 / 285) = 1.262136 / 0.8736842 = 1.444613
odds <- function(p) p/(1-p) odds.ratio <- function(p1, p2) odds(p1)/odds(p2) logit <- function(p) log(p/(1-p)) ilogit <- function(x) exp(x)/(1+exp(x)) # exp() is the exponential function
pm <- tab[1,2]/(tab[1,1]+tab[1,2]) pf <- tab[2,2]/(tab[2,1]+tab[2,2]) om <- odds(pm) of <- odds(pf) ormf <- odds.ratio(pm,pf) pm pf om of ormf > pm [1] 0.5579399 > pf [1] 0.4662921 > om [1] 1.262136 > of [1] 0.8736842 > ormf [1] 1.444613 > x1 <- logit(pm) x2 <- logit(pf) x1 x2 ilogit(x1) ilogit(x2) > x1 <- logit(pm) > x2 <- logit(pf) > x1 [1] 0.2328055 > x2 [1] -0.1350363 > ilogit(x1) [1] 0.5579399 > ilogit(x2) [1] 0.4662921 > >
\begin{align*} \displaystyle ln \left( {\frac{p}{(1-p)}} \right) = a + bX \end{align*}
\begin{align} ln \left( {\frac{p}{1-p}} \right) & = a + bX \nonumber \\ \frac{p}{1-p} & = e^{a+bX} \nonumber \\ p & = e^{a+bX} * (1-p) \nonumber \\ p & = e^{a+bX} - p * \left(e^{a+bX} \right) \nonumber \\ p + p * \left(e^{a+bX} \right) & = e^{a+bX} \nonumber \\ p * \left(1 + e^{a+bX} \right) & = e^{a+bX} \nonumber \\ p & = \frac {e^{a+bX}} { \left(1 + e^{a+bX} \right)} \\ \end{align}
install.packages("sigmoid")
library(sigmoid)
library(ggplot2)
input <- seq(-5, 5, 0.01)
df = data.frame(input, logistic(input), Gompertz(input))
ggplot( df, aes(input, logistic(input)) ) +
geom_line(color="red")
독립변인이 종류일 때에
IVs: categorical or numerical variables
DV: categorical variable
\begin{align} ln \left( {\frac{p}{(1-p)}} \right) & = a + bX \\ \end{align}
ilogit(a) = 0.4662912PF.yes) 같다. odds <- function(p) p/(1-p) odds.ratio <- function(p1, p2) odds(p1)/odds(p2) # log odds를 구하는 function logit <- function(p) log(p/(1-p)) # probability를 구하는 function # p = e^x/(1+e^x) ilogit <- function(x) exp(x)/(1+exp(x))
# 절편 해석 e^-0.13504/(1 + e^-0.13504) # 위의 절편에 대한 p 값 계산 # p = e^x/1+e^x summary(fit.ex6.2)$coefficient # coefficient 값들 출력 summary(fit.ex6.2)$coefficient[1,1] # 절편 값 출력 # or coef(fit.ex6.2)["(Intercept)"] # coef(fit.ex6.2)[1] p <- ilogit(summary(fit.ex6.2)$coefficient[1,1]) p # 이 값은 PF.yes 값과 같다 PF.yes
\begin{align*} ln(\frac{p}{1-p}) = & y \\ \frac {p}{1-p} = & e^{y} \;\;\; \text{where } \;\; y = a + bX \\ \text {odds} = & e^{y} = e^{a + bX} \\ \text{then} \;\;\; \text{odds ratio} (y_{2}/y_{1}) = & \text {odds ratio between } \\ & \text{odds of y at one point, } y_1 \text { and } \\ & \text{odds of y at another point, } y_2 \\ \text{and } y_1 = & a + b (X) \\ y_2 = & a + b (X+1) \\ \text{then } & \;\; \\ \text {odds of } y_1 = & e^{(a+b(X))} \\ \text {odds of } y_2 = & e^{(a+b(X+1))} \\ \text {odds ratio for } y_1 = & \frac {e^{(a+bX+b)} } {e^{(a+bX)}} \\ = & \frac {e^{(a+bX)} * e^{b}} {e^{(a+bX)} } \\ = & e^b \end{align*}
ilogit(0.2328) = 0.5579386summary(fit.ex6.2)$coefficient[2,1]om/of = 1.444613 orodds.ratio(pm, pf) = 1.444613> log(1.444613) [1] 0.3678415 # 이는 계수 값 b값이다. > b <- summary(fit.ex6.2)$coefficient[2,1] > b [1] 0.3678417 > e^b [1] 1.444613 >
nsduh <- nsduh %>%
mutate(demog_sex = relevel(demog_sex, ref = "Female"))
fit.ex6.2 <- glm(mj_lifetime ~ demog_sex,
family = binomial, data = nsduh)
summary(fit.ex6.2)
# install.packages("dplyr")
# library(dplyr)
> nsduh <- nsduh %>%
+ mutate(demog_sex = relevel(demog_sex, ref = "Female"))
> fit.ex6.2 <- glm(mj_lifetime ~ demog_sex,
+ family = binomial, data = nsduh)
> summary(fit.ex6.2)
Call:
glm(formula = mj_lifetime ~ demog_sex, family = binomial, data = nsduh)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.13504 0.08675 -1.557 0.11954
demog_sexMale 0.36784 0.12738 2.888 0.00388 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1386.0 on 999 degrees of freedom
Residual deviance: 1377.6 on 998 degrees of freedom
AIC: 1381.6
Number of Fisher Scoring iterations: 3
>
fit.ex6.2에서
# 위의 b값에 대한 CI을 구하기 위해서 confint 펑션을 사용한다 confint(fit.ex6.2) # b값에 대한 것만 알고 싶으므로 (drop = F는 confint(fit.ex6.2)[2, , drop=F] # 그리고 이 값들의 실제 odds ratio값을 보려면 exp(confint(fit.ex6.2)[2, , drop=F])
> # 위의 b값에 대한 CI을 구하기 위해서 confint 펑션을 사용한다
> confint(fit.ex6.2)
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) -0.3054835 0.03475989
demog_sexMale 0.1185985 0.61808060
> # b값에 대한 것만 알고 싶으므로
> confint(fit.ex6.2)[2, , drop=F]
Waiting for profiling to be done...
2.5 % 97.5 %
demog_sexMale 0.1185985 0.6180806
> # 그리고 이 값들의 실제 odds ratio값을 보려면
> exp(confint(fit.ex6.2)[2, , drop=F])
Waiting for profiling to be done...
2.5 % 97.5 %
demog_sexMale 1.125918 1.855363
>
일반 regression에서 b값은 t-test를 했지만 여기서는 z-test를 (Wald test) 한다. 이는 IV가 종류이거나 숫자일 때 모두 마찬가지이다.
# install.packages("car")
# library(car)
# coefficient probability test
car::Anova(fit.ex6.2, type = 3, test.statistic = "Wald")
> # coefficient probability test
> car::Anova(fit.ex6.2, type = 3, test.statistic = "Wald")
Analysis of Deviance Table (Type III tests)
Response: mj_lifetime
Df Chisq Pr(>Chisq)
(Intercept) 1 2.4233 0.11954
demog_sex 1 8.3394 0.00388 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
마리화나의 사용경험에서 남성이 여성보다 큰 승산이 있다고 판단되었다 (Odds ratio (OR) = 1.44; 95% CI = 1.13, 1.86; p = .004). 남성은 여성보다 약 44% 더 사용경험을 할 승산을 보였다 (OR = 1.44).
######################################## # exercise head(df) table(df) # base 바꾸기 df.norm <- df %>% mutate(gene = relevel(gene, ref = "norm")) df.mut <- df %>% mutate(gene = relevel(gene, ref = "mutant")) logm.cancer.gene.1 <- glm(cancer ~ gene, family = binomial, data = df.norm) summary(logm.cancer.gene.1) a <- logm.cancer.gene.1$coefficients[1] b <- logm.cancer.gene.1$coefficients[2] a b a+b # when b = 0; 즉, mutant = 0 일 때 # log(odds.norm) = a 이므로 # odds.norm = e^a exp(a) # 확인 odds(p.can.norm) # odds.mut = e^(a+b) exp(a+b) odds(p.can.mut) # odds.ratio = e^(b) exp(b) odds.ratio(p.can.mut, p.can.norm) logm.cancer.gene.2 <- glm(cancer ~ gene, family = binomial, data = df.mut) summary(logm.cancer.gene.2) a <- logm.cancer.gene.2$coefficients[1] b <- logm.cancer.gene.2$coefficients[2] a b a+b # when b = 0; 즉, mutant = 0 일 때 # log(odds.norm) = a 이므로 # odds.norm = e^a exp(a) # 확인 odds(p.can.mut) # odds.mut = e^(a+b) exp(a+b) odds(p.can.norm) # odds.ratio = e^(b) exp(b) odds.ratio(p.can.norm, p.can.mut)
########################################
########################################
########################################
# numeric IV
fit.ex6.3 <- glm(mj_lifetime ~ alc_agefirst, family = binomial, data = nsduh)
round(summary(fit.ex6.3)$coef, 4)
ilogit(coef(fit.ex6.3)["(Intercept)"])
# 0.9952 = prob of starting marijuana when age is 0
# when the age is zero (intercept이므로)
# age = 0 에서 추정하는 것은 이상함
summary(nsduh$alc_agefirst)
# 위의 아웃풋에서 Mean값이 약 17이므로 17을
# 기준으로 하여 다시 보면
# install.packages("dplyr") # for %>% function
# library(dplyr)
# install.packages("tidyverse") # for mutate function
# library(tidyverse)
nsduh <- nsduh %>%
mutate(calc_agefirst = alc_agefirst - 17)
fit.ex6.3.centered <- glm(mj_lifetime ~ calc_agefirst,
family = binomial, data = nsduh)
fit.ex6.3.centered
ilogit(coef(fit.ex6.3.centered)["(Intercept)"])
# b coefficient
# 17살일 때를 기준으로 한살씩 증가할 때마다의
# 마리화나 경험/비경험의 Odds ratio는 -0.2835
# 이를 수치화하면
exp(coef(fit.ex6.3.centered)["calc_agefirst"])
# 이는 17이후에 한살씩 알콜처음 경험을 늦추면
# 마리화나 경험율 대 미경험 odds ratio가
# 0.247 낮아진다고 할 수 있다 (0.7531 증가는)
# 1-0.7531로 보는 것
# 그리고 이에 대한 CI를 보면 아래와 같고
confint(fit.ex6.3.centered)[2, , drop = F]
# 이를 승비로 (odds ratio) 보면
exp(confint(fit.ex6.3.centered)[2, , drop = F])
> ########################################
> ########################################
> ########################################
> # numeric IV
> fit.ex6.3 <- glm(mj_lifetime ~ alc_agefirst, family = binomial, data = nsduh)
> round(summary(fit.ex6.3)$coef, 4)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.3407 0.4747 11.2510 0
alc_agefirst -0.2835 0.0267 -10.6181 0
>
> ilogit(coef(fit.ex6.3)["(Intercept)"])
(Intercept)
0.9952302
> # 0.9952 = prob of female starting marijuana
> # when the age is zero (intercept이므로)
>
> # age = 0 에서 추정하는 것은 이상함
> summary(nsduh$alc_agefirst)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
3.00 15.00 17.00 17.49 19.00 45.00 157
>
> # 위의 아웃풋에서 Mean값이 약 17이므로 17을
> # 기준으로 하여 다시 보면
>
> nsduh <- nsduh %>%
+ mutate(calc_agefirst = alc_agefirst - 17)
> fit.ex6.3.centered <- glm(mj_lifetime ~ calc_agefirst,
+ family = binomial, data = nsduh)
> fit.ex6.3.centered
Call: glm(formula = mj_lifetime ~ calc_agefirst, family = binomial,
data = nsduh)
Coefficients:
(Intercept) calc_agefirst
0.5207 -0.2835
Degrees of Freedom: 842 Total (i.e. Null); 841 Residual
(157 observations deleted due to missingness)
Null Deviance: 1141
Residual Deviance: 968.4 AIC: 972.4
> ilogit(coef(fit.ex6.3.centered)["(Intercept)"])
(Intercept)
0.6273001
>
> # b coefficient
> # 17살일 때를 기준으로 한살씩 증가할 때마다의
> # 마리화나 경험/비경험의 Odds ratio는 -0.2835
> # 이를 수치화하면
> exp(coef(fit.ex6.3.centered)["calc_agefirst"])
calc_agefirst
0.7531198
>
> # 이는 17이후에 한살씩 알콜처음 경험을 늦추면
> # 마리화나 경험율 대 미경험 odds ratio가
> # 0.247로 낮아진다고 할 수 있다 (0.7531 증가는)
> # 1-0.7531로 보는 것
>
> # 그리고 이에 대한 CI를 보면 아래와 같고
> confint(fit.ex6.3.centered)[2, , drop = F]
Waiting for profiling to be done...
2.5 % 97.5 %
calc_agefirst -0.3375819 -0.232863
> # 이를 승비로 (odds ratio) 보면
> exp(confint(fit.ex6.3.centered)[2, , drop = F])
Waiting for profiling to be done...
2.5 % 97.5 %
calc_agefirst 0.7134935 0.7922621
>
해석: 처음 알콜경험한 나이와 마리화나 처음경험과는 음의 상관관계를 보였다 (OR = 0.753; 95% CI = 0.713, 0.792; p < .001). 개인의 알콜경험 나이가 한 살씩 많아질 때마다 (가령 18살에서 19살로), 마리화나의 처음경험은 24.7% 낮아지는 것으로 판단이 되었다.
> #################################
> # 1년이 아니라 3년일 경우
> fit.ex6.3.centered
Call: glm(formula = mj_lifetime ~ calc_agefirst, family = binomial,
data = nsduh)
Coefficients:
(Intercept) calc_agefirst
0.5207 -0.2835
Degrees of Freedom: 842 Total (i.e. Null); 841 Residual
(157 observations deleted due to missingness)
Null Deviance: 1141
Residual Deviance: 968.4 AIC: 972.4
> coef(fit.ex6.3.centered)["calc_agefirst"]
calc_agefirst
-0.283531
> coef(fit.ex6.3.centered)["calc_agefirst"]*3
calc_agefirst
-0.850593
> exp(coef(fit.ex6.3.centered)["calc_agefirst"]*3)
calc_agefirst
0.4271616
>
CI는
> # CI 의 경우 아래와 같고
> confint(fit.ex6.3.centered)[2,]*3
Waiting for profiling to be done...
2.5 % 97.5 %
-1.012746 -0.698589
> # 이에 해당하는 값은
> exp(confint(fit.ex6.3.centered)[2,]*3)
Waiting for profiling to be done...
2.5 % 97.5 %
0.3632203 0.4972865
>
DV: lifetime marijuana use (mj_lifetime)
IVs:
fit.ex6.3.adj <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex +
demog_income, family = binomial, data = nsduh)
# Regression coefficient table
round(summary(fit.ex6.3.adj)$coef, 4)
> #################################
> #################################
> ## Multiple regression
> #################################
> fit.ex6.3.adj <- glm(mj_lifetime ~ alc_agefirst +
+ demog_age_cat6 + demog_sex +
+ demog_income,
+ family = binomial, data = nsduh)
> # Regression coefficient table
> round(summary(fit.ex6.3.adj)$coef, 4)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.2542 0.5914 10.5759 0.0000
alc_agefirst -0.2754 0.0276 -9.9922 0.0000
demog_age_cat626-34 -0.2962 0.3286 -0.9012 0.3675
demog_age_cat635-49 -0.8043 0.2966 -2.7120 0.0067
demog_age_cat650-64 -0.6899 0.2985 -2.3109 0.0208
demog_age_cat665+ -1.2748 0.3043 -4.1893 0.0000
demog_sexMale -0.0609 0.1618 -0.3763 0.7067
demog_income$20,000 - $49,999 -0.5309 0.2664 -1.9927 0.0463
demog_income$50,000 - $74,999 -0.0793 0.3049 -0.2601 0.7948
demog_income$75,000 or more -0.3612 0.2532 -1.4264 0.1538
>
# Regression coefficient table
round(summary(fit.ex6.3.adj)$coef, 4)
coef(fit.ex6.3.adj)
exp(coef(fit.ex6.3.adj))
col1 <- exp(coef(fit.ex6.3.adj))
confint(fit.ex6.3.adj)
exp(confint(fit.ex6.3.adj))
col2 <- exp(confint(fit.ex6.3.adj))
cbind("AdjOR" = col1, col2)[-1,]
round(cbind("AdjOR" = col1, col2)[-1,],3)
# OR은 coefficient 값을 이야기하는 것을 다시 확인
# 또한 Wald significant test 도 실행
car::Anova(fit.ex6.3.adj, type = 3, test.statistic = "Wald")
> # Regression coefficient table
> round(summary(fit.ex6.3.adj)$coef, 4)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.2542 0.5914 10.5759 0.0000
alc_agefirst -0.2754 0.0276 -9.9922 0.0000
demog_age_cat626-34 -0.2962 0.3286 -0.9012 0.3675
demog_age_cat635-49 -0.8043 0.2966 -2.7120 0.0067
demog_age_cat650-64 -0.6899 0.2985 -2.3109 0.0208
demog_age_cat665+ -1.2748 0.3043 -4.1893 0.0000
demog_sexMale -0.0609 0.1618 -0.3763 0.7067
demog_income$20,000 - $49,999 -0.5309 0.2664 -1.9927 0.0463
demog_income$50,000 - $74,999 -0.0793 0.3049 -0.2601 0.7948
demog_income$75,000 or more -0.3612 0.2532 -1.4264 0.1538
> coef(fit.ex6.3.adj)
(Intercept) alc_agefirst
6.25417324 -0.27541454
demog_age_cat626-34 demog_age_cat635-49
-0.29618703 -0.80427437
demog_age_cat650-64 demog_age_cat665+
-0.68990572 -1.27475385
demog_sexMale demog_income$20,000 - $49,999
-0.06088993 -0.53087558
demog_income$50,000 - $74,999 demog_income$75,000 or more
-0.07930897 -0.36119745
> exp(coef(fit.ex6.3.adj))
(Intercept) alc_agefirst
520.1791313 0.7592573
demog_age_cat626-34 demog_age_cat635-49
0.7436483 0.4474125
demog_age_cat650-64 demog_age_cat665+
0.5016234 0.2794998
demog_sexMale demog_income$20,000 - $49,999
0.9409268 0.5880898
demog_income$50,000 - $74,999 demog_income$75,000 or more
0.9237545 0.6968414
> col1 <- exp(coef(fit.ex6.3.adj))
> confint(fit.ex6.3.adj)
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) 5.1309585 7.45103372
alc_agefirst -0.3312435 -0.22314999
demog_age_cat626-34 -0.9490643 0.34307731
demog_age_cat635-49 -1.4002915 -0.23429671
demog_age_cat650-64 -1.2893673 -0.11559836
demog_age_cat665+ -1.8854986 -0.68944515
demog_sexMale -0.3790935 0.25566208
demog_income$20,000 - $49,999 -1.0591496 -0.01305382
demog_income$50,000 - $74,999 -0.6785210 0.51882750
demog_income$75,000 or more -0.8643471 0.13016735
> exp(confint(fit.ex6.3.adj))
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) 169.1792047 1721.6419228
alc_agefirst 0.7180303 0.7999949
demog_age_cat626-34 0.3871031 1.4092777
demog_age_cat635-49 0.2465251 0.7911270
demog_age_cat650-64 0.2754450 0.8908329
demog_age_cat665+ 0.1517534 0.5018544
demog_sexMale 0.6844816 1.2913163
demog_income$20,000 - $49,999 0.3467506 0.9870310
demog_income$50,000 - $74,999 0.5073668 1.6800566
demog_income$75,000 or more 0.4213265 1.1390190
> col2 <- exp(confint(fit.ex6.3.adj))
Waiting for profiling to be done...
> cbind("AdjOR" = col1, col2)[-1,]
AdjOR 2.5 % 97.5 %
alc_agefirst 0.7592573 0.7180303 0.7999949
demog_age_cat626-34 0.7436483 0.3871031 1.4092777
demog_age_cat635-49 0.4474125 0.2465251 0.7911270
demog_age_cat650-64 0.5016234 0.2754450 0.8908329
demog_age_cat665+ 0.2794998 0.1517534 0.5018544
demog_sexMale 0.9409268 0.6844816 1.2913163
demog_income$20,000 - $49,999 0.5880898 0.3467506 0.9870310
demog_income$50,000 - $74,999 0.9237545 0.5073668 1.6800566
demog_income$75,000 or more 0.6968414 0.4213265 1.1390190
> round(cbind("AdjOR" = col1, col2)[-1,],3)
AdjOR 2.5 % 97.5 %
alc_agefirst 0.759 0.718 0.800
demog_age_cat626-34 0.744 0.387 1.409
demog_age_cat635-49 0.447 0.247 0.791
demog_age_cat650-64 0.502 0.275 0.891
demog_age_cat665+ 0.279 0.152 0.502
demog_sexMale 0.941 0.684 1.291
demog_income$20,000 - $49,999 0.588 0.347 0.987
demog_income$50,000 - $74,999 0.924 0.507 1.680
demog_income$75,000 or more 0.697 0.421 1.139
>
> # OR은 coefficient 값을 이야기하는 것을 다시 확인
> # 또한 Wald significant test 도 실행
>
> car::Anova(fit.ex6.3.adj, type = 3, test.statistic = "Wald")
Analysis of Deviance Table (Type III tests)
Response: mj_lifetime
Df Chisq Pr(>Chisq)
(Intercept) 1 111.8504 < 2.2e-16 ***
alc_agefirst 1 99.8435 < 2.2e-16 ***
demog_age_cat6 4 23.0107 0.000126 ***
demog_sex 1 0.1416 0.706685
demog_income 3 5.4449 0.141974
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Interpretation of the output:
Conclusion:
# install.packages("oddsratio")
# library(oddsratio)
fit_glm <- glm(admit ~ gre + gpa + rank, data = data_glm, family = "binomial")
summary(fit_glm)
# Calculate OR for specific increment step of continuous variable
or_glm(data = data_glm, model = fit_glm,
incr = list(gre = 40, gpa = .1))
https://stats.idre.ucla.edu/r/dae/logit-regression/
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
## view the first few rows of the data
head(mydata)
admit gre gpa rank 1 0 380 3.61 3 2 1 660 3.67 3 3 1 800 4.00 1 4 1 640 3.19 4 5 0 520 2.93 4 6 1 760 3.00 2
summary(mydata)
admit gre gpa rank Min. :0.0000 Min. :220.0 Min. :2.260 Min. :1.000 1st Qu.:0.0000 1st Qu.:520.0 1st Qu.:3.130 1st Qu.:2.000 Median :0.0000 Median :580.0 Median :3.395 Median :2.000 Mean :0.3175 Mean :587.7 Mean :3.390 Mean :2.485 3rd Qu.:1.0000 3rd Qu.:660.0 3rd Qu.:3.670 3rd Qu.:3.000 Max. :1.0000 Max. :800.0 Max. :4.000 Max. :4.000
sapply(mydata, mean) sapply(mydata, sd)
> sapply(mydata, mean)
admit gre gpa rank
0.3175 587.7000 3.3899 2.4850
> sapply(mydata, sd)
admit gre gpa rank
0.4660867 115.5165364 0.3805668 0.9444602
>
xtabs(~admit + rank, data = mydata)
> xtabs(~admit + rank, data = mydata)
rank
admit 1 2 3 4
0 28 97 93 55
1 33 54 28 12
mydata$rank <- factor(mydata$rank) mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial") summary(mylogit)
Call:
glm(formula = admit ~ gre + gpa + rank, family = "binomial",
data = mydata)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.6268 -0.8662 -0.6388 1.1490 2.0790
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.989979 1.139951 -3.500 0.000465 ***
gre 0.002264 0.001094 2.070 0.038465 *
gpa 0.804038 0.331819 2.423 0.015388 *
rank2 -0.675443 0.316490 -2.134 0.032829 *
rank3 -1.340204 0.345306 -3.881 0.000104 ***
rank4 -1.551464 0.417832 -3.713 0.000205 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 499.98 on 399 degrees of freedom
Residual deviance: 458.52 on 394 degrees of freedom
AIC: 470.52
Number of Fisher Scoring iterations: 4
>
> confint(mylogit)
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) -6.2716202334 -1.792547080
gre 0.0001375921 0.004435874
gpa 0.1602959439 1.464142727
rank2 -1.3008888002 -0.056745722
rank3 -2.0276713127 -0.670372346
rank4 -2.4000265384 -0.753542605
>
> ## CIs using standard errors
> confint.default(mylogit)
2.5 % 97.5 %
(Intercept) -6.2242418514 -1.755716295
gre 0.0001202298 0.004408622
gpa 0.1536836760 1.454391423
rank2 -1.2957512650 -0.055134591
rank3 -2.0169920597 -0.663415773
rank4 -2.3703986294 -0.732528724
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 4:6)
l <- cbind(0, 0, 0, 1, -1, 0) wald.test(b = coef(mylogit), Sigma = vcov(mylogit), L = l)
Logistic Regression Tutorial
\begin{align} y = b_{0} + b_{1}x \\ p = \frac{1} {1 + e^{-y}} \\ ln(\frac{p}{1-p}) = b_{0} + b_{1}x \\ \end{align}
Logistic Regression Details Pt1: Coefficients
Logistic Regression Details Pt 2: Maximum Likelihood
Logistic Regression Details Pt 3: R-squared and p-value