Table of Contents

Logistic Regression

https://www.bookdown.org/rwnahhas/RMPH/blr-orlr.html
data: https://www.bookdown.org/rwnahhas/RMPH/appendix-nsduh.html#appendix-nsduh

Data preparation

getwd()
# 보통 C:/Users/Username/Documents/ 
setwd("~/rData")

Odds

Odds (승산): 한 사건이 일어날 확률과 그 반대의 확률 (일어나지 않을 확률) 간의 비율 한 사건이 일어날 확률과 다름에 주의하라.
\begin{align} \displaystyle \frac {p} {1-p} \end{align}

Odds ratio

Odds ratio (승비): Odds ratio는 두 odds 간의 비율을 말한다.

n <- 350
p.cancer <- 0.08
p.mutant <- 0.39

c <- runif(n, 0, 1)
canc <- ifelse(c>=p.cancer, "nocancer", "cancer")
c <- runif(n, 0, 1)
gene <- ifelse(c>=p.mutant, "norm", "mutated")

da <- data.frame(gene, canc)
da
tab <- table(da)
tab

Logit 성질

여기서
\begin{align*} y & = ln(x) \\ & = log_e {x} \\ x & = e^{y} \\ \end{align*}

위에서

> load("nsduh2019_adult_sub_rmph.RData")
> # Shorter name
> nsduh <- nsduh_adult_sub
> tab <- table(nsduh$demog_sex, nsduh$mj_lifetime)
> tab
        
          No Yes
  Male   206 260
  Female 285 249
> 
# Marijuana experience (me) in lifetime
	NO	YES	
Male	206	260	466
Female	285	249	534
	491	509	1000

P(me among males) = 260 / 466 = 0.5579399
P(me among females) = 249 / 534 = 0.4662921
Odds for males = 260 / 206 = 1.262136
Odds for females = 249 / 285 = 0.8736842
Odds ratio between males and females = (260 / 206) / (249 / 285) = 1.262136 / 0.8736842 = 1.444613
odds       <- function(p)      p/(1-p)
odds.ratio <- function(p1, p2) odds(p1)/odds(p2)
logit      <- function(p)      log(p/(1-p))
ilogit     <- function(x)      exp(x)/(1+exp(x))
# exp() is the exponential function
pm <- tab[1,2]/(tab[1,1]+tab[1,2])
pf <- tab[2,2]/(tab[2,1]+tab[2,2]) 
om <- odds(pm)
of <- odds(pf)
ormf <- odds.ratio(pm,pf)
pm
pf
om
of
ormf

> pm
[1] 0.5579399
> pf
[1] 0.4662921
> om
[1] 1.262136
> of
[1] 0.8736842
> ormf
[1] 1.444613
> 

x1 <- logit(pm)
x2 <- logit(pf)
x1
x2
ilogit(x1)
ilogit(x2)

> x1 <- logit(pm)
> x2 <- logit(pf)
> x1
[1] 0.2328055
> x2
[1] -0.1350363
> ilogit(x1)
[1] 0.5579399
> ilogit(x2)
[1] 0.4662921
> 
> 

Odds ratio in logistic

\begin{align*} ln(\frac{p}{1-p}) = & y \\ \frac {p}{1-p} = & e^{y} \;\;\; \text{where } \;\; y = a + bX \\ \text {odds} = & e^{y} = e^{a + bX} \\ \text{then} \;\;\; \text{odds ratio} (y_{2}/y_{1}) = & \text {odds ratio between } \\ & \text{odds of y at one point, } y_1 \text { and } \\ & \text{odds of y at another point, } y_2 \\ \text{and } y_1 = & a + b (X) \\ y_2 = & a + b (X+1) \\ \text{then } & \;\; \\ \text {odds of } y_1 = & e^{(a+b(X))} \\ \text {odds of } y_2 = & e^{(a+b(X+1))} \\ \text {odds ratio for } y_1 = & \frac {e^{(a+bX+b)} } {e^{(a+bX)}} \\ = & \frac {e^{(a+bX)} * e^{b}} {e^{(a+bX)} } \\ = & e^b \end{align*}

Logitistic Regression Analysis

\begin{align*} \displaystyle ln \left( {\frac{p}{(1-p)}} \right) = a + bX \end{align*}

\begin{align} ln \left( {\frac{p}{1-p}} \right) & = a + bX \nonumber \\ \frac{p}{1-p} & = e^{a+bX} \nonumber \\ p & = e^{a+bX} * (1-p) \nonumber \\ p & = e^{a+bX} - p * \left(e^{a+bX} \right) \nonumber \\ p + p * \left(e^{a+bX} \right) & = e^{a+bX} \nonumber \\ p * \left(1 + e^{a+bX} \right) & = e^{a+bX} \nonumber \\ p & = \frac {e^{a+bX}} { \left(1 + e^{a+bX} \right)} \\ \end{align}

install.packages("sigmoid")
library(sigmoid)
library(ggplot2)
input <- seq(-5, 5, 0.01)
df = data.frame(input, logistic(input), Gompertz(input))
ggplot( df, aes(input, logistic(input)) ) + 
  geom_line(color="red")

Binary Logistic Regression

독립변인이 종류일 때에
IVs: categorical or numerical variables
DV: categorical variable

\begin{align} ln \left( {\frac{p}{(1-p)}} \right) & = a + bX \\ \end{align}

intercept (절편) 해석

odds       <- function(p)      p/(1-p)
odds.ratio <- function(p1, p2) odds(p1)/odds(p2)
# log odds를 구하는 function
logit      <- function(p)      log(p/(1-p))
# probability를 구하는 function 
# p = e^x/(1+e^x)
ilogit     <- function(x)      exp(x)/(1+exp(x))
# 절편 해석 
e^-0.13504/(1 + e^-0.13504) # 위의 절편에 대한 p 값 계산 
# p = e^x/1+e^x
summary(fit.ex6.2)$coefficient # coefficient 값들 출력
summary(fit.ex6.2)$coefficient[1,1] # 절편 값 출력
# or 
coef(fit.ex6.2)["(Intercept)"]
# coef(fit.ex6.2)[1]
p <- ilogit(summary(fit.ex6.2)$coefficient[1,1]) 
p 
# 이 값은 PF.yes 값과 같다
PF.yes

coefficient (계수) 해석

> log(1.444613)
[1] 0.3678415 # 이는 계수 값 b값이다. 

> b <- summary(fit.ex6.2)$coefficient[2,1]
> b
[1] 0.3678417
> e^b
[1] 1.444613
> 

glm in R

nsduh <- nsduh %>% 
  mutate(demog_sex = relevel(demog_sex, ref = "Female"))

fit.ex6.2 <- glm(mj_lifetime ~ demog_sex,
                 family = binomial, data = nsduh)
summary(fit.ex6.2)
# install.packages("dplyr")
# library(dplyr)
> nsduh <- nsduh %>% 
+   mutate(demog_sex = relevel(demog_sex, ref = "Female"))
> fit.ex6.2 <- glm(mj_lifetime ~ demog_sex,
+                  family = binomial, data = nsduh)
> summary(fit.ex6.2)

Call:
glm(formula = mj_lifetime ~ demog_sex, family = binomial, data = nsduh)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)   
(Intercept)   -0.13504    0.08675  -1.557  0.11954   
demog_sexMale  0.36784    0.12738   2.888  0.00388 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1386.0  on 999  degrees of freedom
Residual deviance: 1377.6  on 998  degrees of freedom
AIC: 1381.6

Number of Fisher Scoring iterations: 3

> 

CI of b coefficient

fit.ex6.2에서

# 위의 b값에 대한 CI을 구하기 위해서 confint 펑션을 사용한다
confint(fit.ex6.2)
# b값에 대한 것만 알고 싶으므로 (drop = F는 
confint(fit.ex6.2)[2, , drop=F]
# 그리고 이 값들의 실제 odds ratio값을 보려면 
exp(confint(fit.ex6.2)[2, , drop=F])
> # 위의 b값에 대한 CI을 구하기 위해서 confint 펑션을 사용한다
> confint(fit.ex6.2)
Waiting for profiling to be done...
                   2.5 %     97.5 %
(Intercept)   -0.3054835 0.03475989
demog_sexMale  0.1185985 0.61808060
> # b값에 대한 것만 알고 싶으므로 
> confint(fit.ex6.2)[2, , drop=F]
Waiting for profiling to be done...
                  2.5 %    97.5 %
demog_sexMale 0.1185985 0.6180806
> # 그리고 이 값들의 실제 odds ratio값을 보려면 
> exp(confint(fit.ex6.2)[2, , drop=F])
Waiting for profiling to be done...
                 2.5 %   97.5 %
demog_sexMale 1.125918 1.855363
> 

coefficient값에 대한 테스트

일반 regression에서 b값은 t-test를 했지만 여기서는 z-test를 (Wald test) 한다. 이는 IV가 종류이거나 숫자일 때 모두 마찬가지이다.

# install.packages("car")
# library(car)
# coefficient probability test
car::Anova(fit.ex6.2, type = 3, test.statistic = "Wald")
> # coefficient probability test
> car::Anova(fit.ex6.2, type = 3, test.statistic = "Wald")
Analysis of Deviance Table (Type III tests)

Response: mj_lifetime
            Df  Chisq Pr(>Chisq)   
(Intercept)  1 2.4233    0.11954   
demog_sex    1 8.3394    0.00388 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 

마리화나의 사용경험에서 남성이 여성보다 큰 승산이 있다고 판단되었다 (Odds ratio (OR) = 1.44; 95% CI = 1.13, 1.86; p = .004). 남성은 여성보다 약 44% 더 사용경험을 할 승산을 보였다 (OR = 1.44).

X: numeric variable

########################################
########################################
########################################
# numeric IV
fit.ex6.3 <- glm(mj_lifetime ~ alc_agefirst, family = binomial, data = nsduh)
round(summary(fit.ex6.3)$coef, 4)

ilogit(coef(fit.ex6.3)["(Intercept)"])
# 0.9952 = prob of starting marijuana when age is 0
# when the age is zero (intercept이므로)

# age = 0 에서 추정하는 것은 이상함 
summary(nsduh$alc_agefirst)

# 위의 아웃풋에서 Mean값이 약 17이므로 17을 
# 기준으로 하여 다시 보면
# install.packages("dplyr") # for %>% function
# library(dplyr)
# install.packages("tidyverse") # for mutate function
# library(tidyverse)


nsduh <- nsduh %>% 
  mutate(calc_agefirst = alc_agefirst - 17)
fit.ex6.3.centered <- glm(mj_lifetime ~ calc_agefirst,
                          family = binomial, data = nsduh)
fit.ex6.3.centered
ilogit(coef(fit.ex6.3.centered)["(Intercept)"])

# b coefficient 
# 17살일 때를 기준으로 한살씩 증가할 때마다의 
# 마리화나 경험/비경험의 Odds ratio는  -0.2835
# 이를 수치화하면 
exp(coef(fit.ex6.3.centered)["calc_agefirst"])

# 이는 17이후에 한살씩 알콜처음 경험을 늦추면 
# 마리화나 경험율 대 미경험 odds ratio가 
# 0.247 낮아진다고 할 수 있다 (0.7531 증가는)
# 1-0.7531로 보는 것

# 그리고 이에 대한 CI를 보면 아래와 같고
confint(fit.ex6.3.centered)[2, , drop = F]
# 이를 승비로 (odds ratio) 보면 
exp(confint(fit.ex6.3.centered)[2, , drop = F])
> ########################################
> ########################################
> ########################################
> # numeric IV
> fit.ex6.3 <- glm(mj_lifetime ~ alc_agefirst, family = binomial, data = nsduh)
> round(summary(fit.ex6.3)$coef, 4)
             Estimate Std. Error  z value Pr(>|z|)
(Intercept)    5.3407     0.4747  11.2510        0
alc_agefirst  -0.2835     0.0267 -10.6181        0
> 
> ilogit(coef(fit.ex6.3)["(Intercept)"])
(Intercept) 
  0.9952302 
> # 0.9952 = prob of female starting marijuana 
> # when the age is zero (intercept이므로)
> 
> # age = 0 에서 추정하는 것은 이상함 
> summary(nsduh$alc_agefirst)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   3.00   15.00   17.00   17.49   19.00   45.00     157 
> 
> # 위의 아웃풋에서 Mean값이 약 17이므로 17을 
> # 기준으로 하여 다시 보면
> 
> nsduh <- nsduh %>% 
+   mutate(calc_agefirst = alc_agefirst - 17)
> fit.ex6.3.centered <- glm(mj_lifetime ~ calc_agefirst,
+                           family = binomial, data = nsduh)
> fit.ex6.3.centered

Call:  glm(formula = mj_lifetime ~ calc_agefirst, family = binomial, 
    data = nsduh)

Coefficients:
  (Intercept)  calc_agefirst  
       0.5207        -0.2835  

Degrees of Freedom: 842 Total (i.e. Null);  841 Residual
  (157 observations deleted due to missingness)
Null Deviance:	    1141 
Residual Deviance: 968.4 	AIC: 972.4
> ilogit(coef(fit.ex6.3.centered)["(Intercept)"])
(Intercept) 
  0.6273001 
> 
> # b coefficient 
> # 17살일 때를 기준으로 한살씩 증가할 때마다의 
> # 마리화나 경험/비경험의 Odds ratio는  -0.2835
> # 이를 수치화하면 
> exp(coef(fit.ex6.3.centered)["calc_agefirst"])
calc_agefirst 
    0.7531198 
> 
> # 이는 17이후에 한살씩 알콜처음 경험을 늦추면 
> # 마리화나 경험율 대 미경험 odds ratio가 
> # 0.247로 낮아진다고 할 수 있다 (0.7531 증가는)
> # 1-0.7531로 보는 것
> 
> # 그리고 이에 대한 CI를 보면 아래와 같고
> confint(fit.ex6.3.centered)[2, , drop = F]
Waiting for profiling to be done...
                   2.5 %    97.5 %
calc_agefirst -0.3375819 -0.232863
> # 이를 승비로 (odds ratio) 보면 
> exp(confint(fit.ex6.3.centered)[2, , drop = F])
Waiting for profiling to be done...
                  2.5 %    97.5 %
calc_agefirst 0.7134935 0.7922621
> 

해석: 처음 알콜경험한 나이와 마리화나 처음경험과는 음의 상관관계를 보였다 (OR = 0.753; 95% CI = 0.713, 0.792; p < .001). 개인의 알콜경험 나이가 한 살씩 많아질 때마다 (가령 18살에서 19살로), 마리화나의 처음경험은 24.7% 낮아지는 것으로 판단이 되었다.

IV increase not by one, but by many

> #################################
> # 1년이 아니라 3년일 경우
> fit.ex6.3.centered

Call:  glm(formula = mj_lifetime ~ calc_agefirst, family = binomial, 
    data = nsduh)

Coefficients:
  (Intercept)  calc_agefirst  
       0.5207        -0.2835  

Degrees of Freedom: 842 Total (i.e. Null);  841 Residual
  (157 observations deleted due to missingness)
Null Deviance:	    1141 
Residual Deviance: 968.4 	AIC: 972.4
> coef(fit.ex6.3.centered)["calc_agefirst"]
calc_agefirst 
    -0.283531 
> coef(fit.ex6.3.centered)["calc_agefirst"]*3
calc_agefirst 
    -0.850593 
> exp(coef(fit.ex6.3.centered)["calc_agefirst"]*3)
calc_agefirst 
    0.4271616 
> 

CI는

> # CI 의 경우 아래와 같고
> confint(fit.ex6.3.centered)[2,]*3
Waiting for profiling to be done...
    2.5 %    97.5 % 
-1.012746 -0.698589 
> # 이에 해당하는 값은 
> exp(confint(fit.ex6.3.centered)[2,]*3)
Waiting for profiling to be done...
    2.5 %    97.5 % 
0.3632203 0.4972865 
> 

Multiple Regression

DV: lifetime marijuana use (mj_lifetime)
IVs:

fit.ex6.3.adj <- glm(mj_lifetime ~ alc_agefirst + demog_age_cat6 + demog_sex +
                     demog_income, family = binomial, data = nsduh)
# Regression coefficient table
round(summary(fit.ex6.3.adj)$coef, 4)
> #################################
> #################################
> ## Multiple regression
> #################################
> fit.ex6.3.adj <- glm(mj_lifetime ~ alc_agefirst + 
+                        demog_age_cat6 + demog_sex +
+                        demog_income, 
+                      family = binomial, data = nsduh)
> # Regression coefficient table
> round(summary(fit.ex6.3.adj)$coef, 4)
                              Estimate Std. Error z value Pr(>|z|)
(Intercept)                     6.2542     0.5914 10.5759   0.0000
alc_agefirst                   -0.2754     0.0276 -9.9922   0.0000
demog_age_cat626-34            -0.2962     0.3286 -0.9012   0.3675
demog_age_cat635-49            -0.8043     0.2966 -2.7120   0.0067
demog_age_cat650-64            -0.6899     0.2985 -2.3109   0.0208
demog_age_cat665+              -1.2748     0.3043 -4.1893   0.0000
demog_sexMale                  -0.0609     0.1618 -0.3763   0.7067
demog_income$20,000 - $49,999  -0.5309     0.2664 -1.9927   0.0463
demog_income$50,000 - $74,999  -0.0793     0.3049 -0.2601   0.7948
demog_income$75,000 or more    -0.3612     0.2532 -1.4264   0.1538
> 
# Regression coefficient table
round(summary(fit.ex6.3.adj)$coef, 4)
coef(fit.ex6.3.adj)
exp(coef(fit.ex6.3.adj))
col1 <- exp(coef(fit.ex6.3.adj))
confint(fit.ex6.3.adj)
exp(confint(fit.ex6.3.adj))
col2 <- exp(confint(fit.ex6.3.adj))
cbind("AdjOR" = col1, col2)[-1,]
round(cbind("AdjOR" = col1, col2)[-1,],3)

# OR은 coefficient 값을 이야기하는 것을 다시 확인
# 또한 Wald significant test 도 실행 
car::Anova(fit.ex6.3.adj, type = 3, test.statistic = "Wald")
> # Regression coefficient table
> round(summary(fit.ex6.3.adj)$coef, 4)
                              Estimate Std. Error z value Pr(>|z|)
(Intercept)                     6.2542     0.5914 10.5759   0.0000
alc_agefirst                   -0.2754     0.0276 -9.9922   0.0000
demog_age_cat626-34            -0.2962     0.3286 -0.9012   0.3675
demog_age_cat635-49            -0.8043     0.2966 -2.7120   0.0067
demog_age_cat650-64            -0.6899     0.2985 -2.3109   0.0208
demog_age_cat665+              -1.2748     0.3043 -4.1893   0.0000
demog_sexMale                  -0.0609     0.1618 -0.3763   0.7067
demog_income$20,000 - $49,999  -0.5309     0.2664 -1.9927   0.0463
demog_income$50,000 - $74,999  -0.0793     0.3049 -0.2601   0.7948
demog_income$75,000 or more    -0.3612     0.2532 -1.4264   0.1538
> coef(fit.ex6.3.adj)
                  (Intercept)                  alc_agefirst 
                   6.25417324                   -0.27541454 
          demog_age_cat626-34           demog_age_cat635-49 
                  -0.29618703                   -0.80427437 
          demog_age_cat650-64             demog_age_cat665+ 
                  -0.68990572                   -1.27475385 
                demog_sexMale demog_income$20,000 - $49,999 
                  -0.06088993                   -0.53087558 
demog_income$50,000 - $74,999   demog_income$75,000 or more 
                  -0.07930897                   -0.36119745 
> exp(coef(fit.ex6.3.adj))
                  (Intercept)                  alc_agefirst 
                  520.1791313                     0.7592573 
          demog_age_cat626-34           demog_age_cat635-49 
                    0.7436483                     0.4474125 
          demog_age_cat650-64             demog_age_cat665+ 
                    0.5016234                     0.2794998 
                demog_sexMale demog_income$20,000 - $49,999 
                    0.9409268                     0.5880898 
demog_income$50,000 - $74,999   demog_income$75,000 or more 
                    0.9237545                     0.6968414 
> col1 <- exp(coef(fit.ex6.3.adj))
> confint(fit.ex6.3.adj)
Waiting for profiling to be done...
                                   2.5 %      97.5 %
(Intercept)                    5.1309585  7.45103372
alc_agefirst                  -0.3312435 -0.22314999
demog_age_cat626-34           -0.9490643  0.34307731
demog_age_cat635-49           -1.4002915 -0.23429671
demog_age_cat650-64           -1.2893673 -0.11559836
demog_age_cat665+             -1.8854986 -0.68944515
demog_sexMale                 -0.3790935  0.25566208
demog_income$20,000 - $49,999 -1.0591496 -0.01305382
demog_income$50,000 - $74,999 -0.6785210  0.51882750
demog_income$75,000 or more   -0.8643471  0.13016735
> exp(confint(fit.ex6.3.adj))
Waiting for profiling to be done...
                                    2.5 %       97.5 %
(Intercept)                   169.1792047 1721.6419228
alc_agefirst                    0.7180303    0.7999949
demog_age_cat626-34             0.3871031    1.4092777
demog_age_cat635-49             0.2465251    0.7911270
demog_age_cat650-64             0.2754450    0.8908329
demog_age_cat665+               0.1517534    0.5018544
demog_sexMale                   0.6844816    1.2913163
demog_income$20,000 - $49,999   0.3467506    0.9870310
demog_income$50,000 - $74,999   0.5073668    1.6800566
demog_income$75,000 or more     0.4213265    1.1390190
> col2 <- exp(confint(fit.ex6.3.adj))
Waiting for profiling to be done...
> cbind("AdjOR" = col1, col2)[-1,]
                                  AdjOR     2.5 %    97.5 %
alc_agefirst                  0.7592573 0.7180303 0.7999949
demog_age_cat626-34           0.7436483 0.3871031 1.4092777
demog_age_cat635-49           0.4474125 0.2465251 0.7911270
demog_age_cat650-64           0.5016234 0.2754450 0.8908329
demog_age_cat665+             0.2794998 0.1517534 0.5018544
demog_sexMale                 0.9409268 0.6844816 1.2913163
demog_income$20,000 - $49,999 0.5880898 0.3467506 0.9870310
demog_income$50,000 - $74,999 0.9237545 0.5073668 1.6800566
demog_income$75,000 or more   0.6968414 0.4213265 1.1390190
> round(cbind("AdjOR" = col1, col2)[-1,],3)
                              AdjOR 2.5 % 97.5 %
alc_agefirst                  0.759 0.718  0.800
demog_age_cat626-34           0.744 0.387  1.409
demog_age_cat635-49           0.447 0.247  0.791
demog_age_cat650-64           0.502 0.275  0.891
demog_age_cat665+             0.279 0.152  0.502
demog_sexMale                 0.941 0.684  1.291
demog_income$20,000 - $49,999 0.588 0.347  0.987
demog_income$50,000 - $74,999 0.924 0.507  1.680
demog_income$75,000 or more   0.697 0.421  1.139
> 
> # OR은 coefficient 값을 이야기하는 것을 다시 확인
> # 또한 Wald significant test 도 실행 
> 
> car::Anova(fit.ex6.3.adj, type = 3, test.statistic = "Wald")
Analysis of Deviance Table (Type III tests)

Response: mj_lifetime
               Df    Chisq Pr(>Chisq)    
(Intercept)     1 111.8504  < 2.2e-16 ***
alc_agefirst    1  99.8435  < 2.2e-16 ***
demog_age_cat6  4  23.0107   0.000126 ***
demog_sex       1   0.1416   0.706685    
demog_income    3   5.4449   0.141974    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Interpretation of the output:

Conclusion:

e.g. 1

# install.packages("oddsratio")
# library(oddsratio)
fit_glm <- glm(admit ~ gre + gpa + rank, data = data_glm, family = "binomial") 

summary(fit_glm)



# Calculate OR for specific increment step of continuous variable
or_glm(data = data_glm, model = fit_glm, 
       incr = list(gre = 40, gpa = .1))

e.g. 2

https://stats.idre.ucla.edu/r/dae/logit-regression/

mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
## view the first few rows of the data
head(mydata)
  admit gre  gpa rank
1     0 380 3.61    3
2     1 660 3.67    3
3     1 800 4.00    1
4     1 640 3.19    4
5     0 520 2.93    4
6     1 760 3.00    2
summary(mydata)
     admit             gre             gpa             rank      
 Min.   :0.0000   Min.   :220.0   Min.   :2.260   Min.   :1.000  
 1st Qu.:0.0000   1st Qu.:520.0   1st Qu.:3.130   1st Qu.:2.000  
 Median :0.0000   Median :580.0   Median :3.395   Median :2.000  
 Mean   :0.3175   Mean   :587.7   Mean   :3.390   Mean   :2.485  
 3rd Qu.:1.0000   3rd Qu.:660.0   3rd Qu.:3.670   3rd Qu.:3.000  
 Max.   :1.0000   Max.   :800.0   Max.   :4.000   Max.   :4.000  
sapply(mydata, mean)
sapply(mydata, sd)
> sapply(mydata, mean)
   admit      gre      gpa     rank 
  0.3175 587.7000   3.3899   2.4850 
> sapply(mydata, sd)
      admit         gre         gpa        rank 
  0.4660867 115.5165364   0.3805668   0.9444602 
> 
xtabs(~admit + rank, data = mydata)
> xtabs(~admit + rank, data = mydata)
     rank
admit  1  2  3  4
    0 28 97 93 55
    1 33 54 28 12
mydata$rank <- factor(mydata$rank)
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)
Call:
glm(formula = admit ~ gre + gpa + rank, family = "binomial", 
    data = mydata)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6268  -0.8662  -0.6388   1.1490   2.0790  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -3.989979   1.139951  -3.500 0.000465 ***
gre          0.002264   0.001094   2.070 0.038465 *  
gpa          0.804038   0.331819   2.423 0.015388 *  
rank2       -0.675443   0.316490  -2.134 0.032829 *  
rank3       -1.340204   0.345306  -3.881 0.000104 ***
rank4       -1.551464   0.417832  -3.713 0.000205 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 499.98  on 399  degrees of freedom
Residual deviance: 458.52  on 394  degrees of freedom
AIC: 470.52

Number of Fisher Scoring iterations: 4

> 
> confint(mylogit)
Waiting for profiling to be done...
                    2.5 %       97.5 %
(Intercept) -6.2716202334 -1.792547080
gre          0.0001375921  0.004435874
gpa          0.1602959439  1.464142727
rank2       -1.3008888002 -0.056745722
rank3       -2.0276713127 -0.670372346
rank4       -2.4000265384 -0.753542605
> 
> ## CIs using standard errors
> confint.default(mylogit)
                    2.5 %       97.5 %
(Intercept) -6.2242418514 -1.755716295
gre          0.0001202298  0.004408622
gpa          0.1536836760  1.454391423
rank2       -1.2957512650 -0.055134591
rank3       -2.0169920597 -0.663415773
rank4       -2.3703986294 -0.732528724
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 4:6)
l <- cbind(0, 0, 0, 1, -1, 0)
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), L = l)

관련 동영상

Log의 성질

Logistic Regression Tutorial

\begin{align} y = b_{0} + b_{1}x \\ p = \frac{1} {1 + e^{-y}} \\ ln(\frac{p}{1-p}) = b_{0} + b_{1}x \\ \end{align}

Logistic Regression Details Pt1: Coefficients

Logistic Regression Details Pt 2: Maximum Likelihood

Logistic Regression Details Pt 3: R-squared and p-value