partial_and_semipartial_correlation
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revisionNext revisionBoth sides next revision | ||
partial_and_semipartial_correlation [2019/10/13 00:03] – [regression gpa against sat] hkimscil | partial_and_semipartial_correlation [2023/05/31 08:56] – [e.g., 독립변인 들이 서로 독립적일 때의 각각의 설명력] hkimscil | ||
---|---|---|---|
Line 2: | Line 2: | ||
references | references | ||
{{https:// | {{https:// | ||
+ | or [[https:// | ||
+ | ====== Partial and semipartial ====== | ||
+ | < | ||
+ | options(digits = 4) | ||
+ | HSGPA <- c(3.0, 3.2, 2.8, 2.5, 3.2, 3.8, 3.9, 3.8, 3.5, 3.1) | ||
+ | FGPA <- c(2.8, 3.0, 2.8, 2.2, 3.3, 3.3, 3.5, 3.7, 3.4, 2.9) | ||
+ | SATV <- c(500, 550, 450, 400, 600, 650, 700, 550, 650, 550) | ||
+ | scholar <- data.frame(FGPA, | ||
+ | describe(scholar) # provides descriptive information about each variable | ||
- | Simple explanation of the below procedures is like this: | + | corrs <- cor(scholar) # find the correlations |
- | * Separately regress Y and X1 against X2, that is, | + | corrs # print corrs |
- | * regress Y against X2 AND | + | |
- | * regress X1 against X2. | + | |
- | * Regress the Y residuals against the X1 residuals. | + | |
- | In the below example, | + | |
- | * regress gpa against sat | + | |
- | * regress clep against sat | + | |
- | * regress the gpa residuals against clep residuals. | + | |
- | Take a close look at the graphs, especially, the grey areas. | + | |
- | For more, see https:// | + | pairs(scholar) |
+ | </code> | ||
- | - sat, clep이 각각 | + | < |
- | | + | attach(scholar) |
- | - lm.gpa.clepsat 를 구해서 sat의 영향력이 사라지는 것을 본다. | + | # freshman' |
- | - < | + | mod.all <- lm(FGPA ~ HSGPA+ SATV, data = scholar) |
- | - sat에는 clep과 관련이 있는 | + | summary(mod.all) |
- | - 만약에 독립변인인 sat와 clep이 orthogonal하다면 (즉, 상관관계가 0이라면), 스트라이크 아웃된 부분이 가능하겠지만 그렇지 않기에 sat에서 clep의 부분을 제거한 부분을 구해서 즉, res.lm.sat.clep을 구해서 res.lm.gpa.clep와의 상관관계를 본다. | + | </ |
+ | < | ||
+ | > summary(mod.all) | ||
- | ====== Partial cor ====== | + | Call: |
- | please refer to the page: http:// | + | lm(formula |
- | {{: | + | |
- | | Person | + | |
- | | 1 | 500 | 30 | 2.8 | | + | |
- | | 2 | 550 | 32 | 3 | | + | |
- | | 3 | 450 | 28 | 2.9 | | + | |
- | | 4 | 400 | 25 | 2.8 | | + | |
- | | 5 | 600 | 32 | 3.3 | | + | |
- | | 6 | 650 | 38 | 3.3 | | + | |
- | | 7 | 700 | 39 | 3.5 | | + | |
- | | 8 | 550 | 38 | 3.7 | | + | |
- | | 9 | 650 | 35 | 3.4 | | + | |
- | | 10 | 550 | 31 | 2.9 | | + | |
- | < | + | Residuals: |
+ | Min 1Q Median | ||
+ | -0.2431 -0.1125 -0.0286 | ||
- | # import test score data " | + | Coefficients: |
- | tests <- read.csv(" | + | |
- | colnames(tests) <- c(" | + | (Intercept) 0.233102 |
- | tests <- subset(tests, | + | HSGPA |
- | attach(tests) | + | SATV 0.000151 |
- | cors <- cor(tests) | + | --- |
- | round(cors, | + | Signif. codes: |
- | </ | + | |
- | < | + | Residual standard error: 0.192 on 7 degrees of freedom |
- | sat 1.000 0.875 0.718 | + | Multiple R-squared: |
- | clep 0.875 1.000 0.876 | + | F-statistic: |
- | gpa 0.718 0.876 1.000 | + | |
+ | > | ||
> </ | > </ | ||
- | ===== regression gpa against sat ===== | + | [{{: |
+ | SATV 는 significant한 역할을 하지 못한다는 t-test 결과이다. 이것이 사실일까? | ||
+ | 우선 FGPA에 대해 SATV와 HSGPA를 가지고 regression을 각각 해보자 | ||
+ | 아래의 결과는 두개의 IV는 각각 종속변인인 FGPA를 significant하게 설명하고 있다. | ||
+ | 즉, 둘이 같이 설명하려고 했을 때에만 그 설명력이 사라진다. | ||
+ | <WRAP clear/> | ||
+ | < | ||
+ | attach(scholar) | ||
+ | ma1 <- lm(FGPA ~ SATV) | ||
+ | ma2 <- lm(FGPA ~ HSGPA) | ||
+ | summary(ma1) | ||
+ | summary(ma2) | ||
+ | </ | ||
- | < | + | < |
- | > summary(lm.gpa.sat) | + | > ma2 <- lm(FGPA ~ HSGPA) |
+ | > summary(ma1) | ||
Call: | Call: | ||
- | lm(formula = gpa ~ sat, data = tests) | + | lm(formula = FGPA ~ SATV) |
Residuals: | Residuals: | ||
- | Min | + | |
- | -0.23544 -0.12184 | + | -0.2804 -0.1305 -0.0566 0.0350 0.6481 |
Coefficients: | Coefficients: | ||
- | Estimate Std. Error t value Pr(> | + | |
- | (Intercept) | + | (Intercept) |
- | sat 0.0024557 | + | SATV 0.00381 |
--- | --- | ||
Signif. codes: | Signif. codes: | ||
- | Residual standard error: 0.2365 on 8 degrees of freedom | + | Residual standard error: 0.27 on 8 degrees of freedom |
- | Multiple R-squared: | + | Multiple R-squared: |
- | F-statistic: | + | F-statistic: |
- | > | + | > summary(ma2) |
- | > </ | + | |
- | linear model | + | Call: |
- | '' | + | lm(formula |
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -0.2434 -0.1094 -0.0266 | ||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) | ||
+ | HSGPA 0.872 0.129 6.77 0.00014 *** | ||
+ | --- | ||
+ | Signif. codes: | ||
- | {{lm.gpa.sat.png? | + | Residual standard error: 0.179 on 8 degrees of freedom |
- | Note that .718 = correlation coefficient of sat and gpa. | + | Multiple R-squared: |
- | < | + | F-statistic: |
- | [1] 0.7180529</ | + | |
- | Collect | ||
- | - sat, | ||
- | - gpa, | ||
- | - predicted value (y hat), | ||
- | - residuals (error) | ||
- | And see correlation among themselves. | ||
- | < | ||
- | > cor.gpa.sat <- as.data.frame(cbind(sat, | ||
- | > colnames(cor.gpa.sat) <- c(" | ||
- | > round(cor.gpa.sat, | ||
- | sat gpa pred resid | ||
- | 1 500 2.8 3.01266 -0.21266 | ||
- | 2 550 3.0 3.13544 -0.13544 | ||
- | 3 450 2.9 2.88987 | ||
- | 4 400 2.8 2.76709 | ||
- | 5 600 3.3 3.25823 | ||
- | 6 650 3.3 3.38101 -0.08101 | ||
- | 7 700 3.5 3.50380 -0.00380 | ||
- | 8 550 3.7 3.13544 | ||
- | 9 650 3.4 3.38101 | ||
- | 10 550 2.9 3.13544 -0.23544 | ||
- | > | ||
- | > round(cor(cor.gpa.sat), | ||
- | sat | ||
- | sat 1.000 0.718 1.000 0.000 | ||
- | gpa 0.718 1.000 0.718 0.696 | ||
- | pred 1.000 0.718 1.000 0.000 | ||
- | resid 0.000 0.696 0.000 1.000 | ||
> | > | ||
- | > </ | ||
- | Note that | ||
- | * r (sat and gpa) = .718 | ||
- | * r (sat and pred) = 1. In other words, predicted values (y hats) are the linear function of x (sat) values ('' | ||
- | * r (sat and resid) = 0. residuals are orthogonal to the independent (sat) values. | ||
- | ===== regression gpa against clep ===== | ||
- | < | ||
- | tests <- read.csv(" | ||
- | colnames(tests) <- c(" | ||
- | tests <- subset(tests, | ||
- | attach(tests) | ||
</ | </ | ||
+ | |||
+ | 아래는 HSGPA의 영향력을 IV, DV 모두에게서 제거한 후, DV에 대한 IV의 영향력을 보는 작업이다. | ||
< | < | ||
- | lm.gpa.clep <- lm(gpa ~ clep, data = tests) | + | m1 <- lm(FGPA ~ HSGPA) |
- | summary(lm.gpa.clep) | + | m2 <- lm(SATV ~ HSGPA) |
+ | res.m1 <- resid(m1) | ||
+ | # res.m1 <- m1$residuals | ||
+ | res.m2 <- resid(m2) | ||
+ | m.12 <- lm(res.m1 | ||
+ | summary(m.12) | ||
</ | </ | ||
- | < | + | |
- | lm(formula = gpa ~ clep, data = tests) | + | 결과는 아래에서 보는 것처럼 not significant하다. |
+ | |||
+ | < | ||
+ | |||
+ | > m1 <- lm(FGPA ~ HSGPA) | ||
+ | > m2 <- lm(SATV ~ HSGPA) | ||
+ | > res.m1 <- resid(m1) | ||
+ | > res.m2 <- resid(m2) | ||
+ | > m.12 <- lm(res.m1 ~ res.m2) | ||
+ | > summary(m.12) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = res.m1 | ||
Residuals: | Residuals: | ||
- | | + | |
- | -0.190496 | + | -0.2431 -0.1125 -0.0286 0.1269 0.2716 |
Coefficients: | Coefficients: | ||
- | | + | Estimate Std. Error t value Pr(> |
- | (Intercept) | + | (Intercept) |
- | clep 0.06054 0.01177 | + | res.m2 1.51e-04 |
- | --- | + | |
- | Signif. codes: | + | |
- | 0 ‘***’ | + | |
- | Residual standard error: 0.1637 on 8 degrees of freedom | + | Residual standard error: 0.179 on 8 degrees of freedom |
- | Multiple R-squared: | + | Multiple R-squared: |
- | F-statistic: | + | F-statistic: |
- | < | ||
- | # get residuals | ||
- | res.lm.gpa.clep <- lm.gpa.clep$residuals | ||
</ | </ | ||
- | {{lm.gpa.clep.png?500}} | + | 특히 위에서 R-squared value는 0.00165이고 이를 square root 한 값은 $\sqrt{0.00165} = 0.04064$ 이다. 이 값은 HSGPA의 영향력을 IV와 (SATV) DV에서 (FGPA) 모두 제거한 후의 correlation값이라고도 할 수 있다. 사실 이 숫자는 '' |
+ | |||
+ | < | ||
+ | ## 혹은 | ||
+ | cor.test(res.m1, | ||
< | < | ||
- | # get cor between gpa, sat, pred, and resid from. lm.gpa.clep | + | > cor(res.m1, res.m2) |
- | cor.gpa.clep <- as.data.frame(cbind(gpa, | + | [1] 0.04064 |
- | colnames(cor.gpa.clep) <- c(" | + | > |
- | cor(cor.gpa.clep) | + | > cor.test(res.m1, res.m2) |
+ | |||
+ | Pearson' | ||
+ | |||
+ | data: res.m1 and res.m2 | ||
+ | t = 0.12, df = 8, p-value = 0.9 | ||
+ | alternative hypothesis: true correlation is not equal to 0 | ||
+ | 95 percent confidence interval: | ||
+ | -0.6045 0.6535 | ||
+ | sample estimates: | ||
+ | | ||
+ | 0.04064 | ||
+ | > | ||
</ | </ | ||
- | < | ||
- | gpa | ||
- | clep 0.8763 1.0000 1.0000 0.0000 | ||
- | pred 0.8763 1.0000 1.0000 0.0000 | ||
- | resid 0.4818 0.0000 0.0000 1.0000 | ||
- | > </ | ||
+ | 우리는 이것을 [[: | ||
- | ===== regression gpa against both celp and sat ===== | ||
< | < | ||
- | lm.gpa.clepsat <- lm(gpa ~ clep + sat, data = tests) | + | # install.packages(" |
- | summary(lm.gpa.clepsat) | + | pcor.test(FGPA, SATV, HSGPA) |
</ | </ | ||
+ | < | ||
+ | estimate p.value statistic | ||
+ | 1 0.04064 | ||
+ | > </ | ||
+ | 위에서 estimate 값인 0.04064가 위의 R-square 값에 square root값을 씌운 값이 된다. 이를 그림으로 나타내 보면 아래와 같다. | ||
+ | |||
+ | [{{: | ||
+ | 반대의 경우도 실행을 해보면 즉, SATV의 영향력을 제어한 후, HSGPA의 영향력만을 볼 때 | ||
< | < | ||
+ | n1 <- lm(FGPA ~ SATV) | ||
+ | n2 <- lm(HSGPA ~ SATV) | ||
+ | res.n1 <- resid(n1) | ||
+ | res.n2 <- resid(n2) | ||
+ | n.12 <- lm(res.n1 ~ res.n2) | ||
+ | summary(n.12) | ||
+ | </ | ||
+ | |||
+ | < | ||
+ | > n2 <- lm(HSGPA ~ SATV) | ||
+ | > res.n1 <- resid(n1) | ||
+ | > res.n2 <- resid(n2) | ||
+ | > n.12 <- lm(res.n1 ~ res.n2) | ||
+ | > summary(n.12) | ||
Call: | Call: | ||
- | lm(formula = gpa ~ clep + sat, data = tests) | + | lm(formula = res.n1 |
Residuals: | Residuals: | ||
- | | + | |
- | -0.197888 | + | -0.2431 -0.1125 -0.0286 0.1269 0.2716 |
Coefficients: | Coefficients: | ||
- | | + | Estimate Std. Error t value Pr(> |
- | (Intercept) | + | (Intercept) |
- | clep 0.0729294 | + | res.n2 |
- | sat -0.0007015 | + | |
--- | --- | ||
- | Signif. codes: | + | Signif. codes: |
- | 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 | + | |
- | Residual standard error: 0.1713 on 7 degrees of freedom | + | Residual standard error: 0.179 on 8 degrees of freedom |
- | Multiple R-squared: | + | Multiple R-squared: |
- | F-statistic: | + | F-statistic: |
> | > | ||
</ | </ | ||
- | ===== checking partial cor 1 ===== | + | 이제는 그 R-squared 값이 0.559임을 알게 되었다. 이의 제곱근은 $\sqrt{0.559} = 0.748$ | ||
+ | |||
+ | [{{: | ||
< | < | ||
- | # get res.lm.clep.sat | + | pcor.test(FGPA, HSGPA, SATV) |
- | lm.sat.clep <- lm(sat ~ clep, data = tests) | + | |
- | summary(lm.sat.clep) | + | |
</ | </ | ||
- | < | + | < |
- | Call: | + | estimate p.value statistic |
- | lm(formula = sat ~ clep, data = tests) | + | 1 |
+ | > </ | ||
- | Residuals: | + | <fc # |
- | | + | |
- | -101.860 -19.292 | + | |
- | Coefficients: | ||
- | Estimate Std. Error t value Pr(> | ||
- | (Intercept) | ||
- | clep 17.665 | ||
- | --- | ||
- | Signif. codes: | ||
- | 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 | ||
- | Residual standard error: 48.2 on 8 degrees of freedom | ||
- | Multiple R-squared: | ||
- | F-statistic: | ||
- | > </ | + | 또한 위의 설명은 [[: |
- | {{lm.sat.clep.png?400}} | + | |
+ | 아래의 결과를 살펴보면 anova() 결과 독립변인들의 p value 들과 summary() 에서의 독립변인들의 p value가 다른 이유가 있다. | ||
< | < | ||
- | res.lm.sat.clep <- lm.sat.clep$residuals | + | # anova()에서의 결과 |
+ | acs_k3 | ||
+ | # summary(lm())에서의 결과 | ||
+ | acs_k3 | ||
</ | </ | ||
+ | 아래는 [[: | ||
< | < | ||
- | install.packages("ppcor") | + | dvar <- read.csv("http:// |
- | library(ppcor) | + | mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar) |
+ | summary(mod) | ||
+ | anova(mod) | ||
</ | </ | ||
< | < | ||
- | pcor.gpa.sat.clep <- lm(res.lm.gpa.clep | + | > dvar <- read.csv(" |
- | summary(pcor.gpa.sat.clep) | + | > mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar) |
- | </ | + | > summary(mod) |
- | < | + | |
Call: | Call: | ||
- | lm(formula = res.lm.gpa.clep | + | lm(formula = api00 ~ ell + acs_k3 + avg_ed + meals, data = dvar) |
Residuals: | Residuals: | ||
- | | + | Min |
- | -0.197888 | + | -187.020 |
Coefficients: | Coefficients: | ||
- | | + | |
- | (Intercept) | + | (Intercept) |
- | res.lm.sat.clep -7.015e-04 1.175e-03 -0.597 | + | ell -0.8434 |
+ | acs_k3 | ||
+ | avg_ed | ||
+ | meals -2.9374 | ||
+ | --- | ||
+ | Signif. codes: | ||
- | Residual standard error: | + | Residual standard error: |
- | Multiple R-squared: | + | (21 observations deleted due to missingness) |
- | F-statistic: | + | Multiple R-squared: |
+ | F-statistic: | ||
- | </code> | + | > anova(mod) |
+ | Analysis of Variance Table | ||
- | {{pcor.gpa.sat.clep.png? | + | Response: api00 |
- | + | Df Sum Sq Mean Sq F value Pr(> | |
- | <code> | + | ell 1 4502711 4502711 1309.762 |
- | > pcor.gpa.sat.clep <- pcor.test(gpa, | + | acs_k3 |
- | > pcor.gpa.sat.clep | + | avg_ed |
- | | + | meals |
- | 1 -0.2064849 | + | Residuals 374 1285740 |
- | > pcor.gpa.sat.clep$estimate^2 | + | --- |
- | [1] 0.04263601 | + | Signif. codes: |
> | > | ||
</ | </ | ||
- | < | ||
- | plot(d)</ | ||
- | {{pcor.sat.clep.gpa.res.png}} | + | 아래에서 mod2를 보면 |
+ | * api00이 종속변인이고, | ||
+ | * ell, avg_ed, meals + acs_k3 가 독립변인인데, | ||
+ | * 그 순서가 이전 문서의 | ||
+ | * ell, acs_k3, avg_ed, meals에서 바뀐것을 알 수 있다 (acs.k3가 맨 뒤로 감). | ||
+ | * 즉, | ||
+ | * lm(api00 ~ ell + acs_k3 + avg_ed + meals) | ||
+ | * lm(api00 ~ ell + avg_ed + meals + acs_k3) | ||
- | Note that the relationship between res.lm.gpa.clep and sat look like negative, which can be confirmed in the lm.gpa.satclep < | + | anova는 독립변인에 대한 영향력을 다른 IV들을 고려하지 않고, 그냥 입력 순서대로 처리하므로, acs_k3를 마지막으로 보냄으로써, 다른 IV들이 DV에 대한 설명력을 모두 차지하고 그 나머지를 보여주게 된다. 이것이 t-test가 significant하지 않은 이유이다. |
- | ===== checking partial cor 2 ===== | + | |
- | < | + | |
- | > # import test score data " | + | |
- | > tests <- read.csv(" | + | |
- | > colnames(tests) <- c(" | + | |
- | > tests <- subset(tests, | + | |
- | > attach(tests) | + | |
- | > cor(tests) | + | |
- | | + | |
- | sat 1.0000000 0.8745001 0.7180459 | + | |
- | clep 0.8745001 1.0000000 0.8762720 | + | |
- | gpa 0.7180459 0.8762720 1.0000000 | + | |
- | </ | + | |
- | + | ||
- | $$ | + | |
- | r_{12.3} = \frac {r_{12} | + | |
- | $$ | + | |
- | (1 = GPA, 2 = CLEP, 3 = SAT) | + | |
- | + | ||
- | \begin{eqnarray*} | + | |
- | r_{\text{gpaclep.sat}} & = & \frac {r_{\text{gpaclep}} - r_{\text{gpasat}} r_{\text{clepsat}} } {\sqrt{1-r_{\text{gpasat}}^2} \sqrt{1-r_{\text{clepsat}}^2}} \\ | + | |
- | & = & \frac {0.8762720 - (0.7180459)(0.8745001)}{\sqrt{1-0.7180459^2} \sqrt{1-0.8745001^2}} \\ | + | |
- | & = & .73 | + | |
- | \end{eqnarray*} | + | |
- | $$ | + | < |
- | r_{12.3} = \frac {r_{12} - r_{13} r_{23} } {\sqrt{1-r_{13}^2} \sqrt{1-r_{23}^2}} | + | > summary(mod2) |
- | $$ | + | |
- | (1 = gpa, 2 = sat, 3 = clep) | + | |
- | + | ||
- | \begin{eqnarray*} | + | |
- | r_{\text{gpasat.clep}} & = & \frac {r_{\text{gpasat}} - r_{\text{gpaclep}} r_{\text{satclep}} } {\sqrt{1-r_{\text{gpaclep}}^2} \sqrt{1-r_{\text{satclep}}^2}} \\ | + | |
- | & = & \frac {0.7180459 - (0.8762720)(0.8745001)}{\sqrt{1-0.8762720^2} \sqrt{1-0.8745001^2}} \\ | + | |
- | & = & 0.04263585 | + | |
- | \end{eqnarray*} | + | |
- | + | ||
- | < | + | |
- | | + | |
- | sat 1.0000000 0.8745001 0.7180459 | + | |
- | clep 0.8745001 1.0000000 0.8762720 | + | |
- | gpa 0.7180459 0.8762720 1.0000000 | + | |
- | > round(cor(tests), | + | |
- | sat | + | |
- | sat 1.0000 0.8745 0.7180 | + | |
- | clep 0.8745 1.0000 0.8763 | + | |
- | gpa 0.7180 0.8763 1.0000 | + | |
- | > c<- (.7180459-(.8762720*.8745001)) | + | |
- | > d <- (sqrt(1-.8762720^2) * sqrt(1-.8745001^2)) | + | |
- | > c/d | + | |
- | [1] -0.2064845 | + | |
- | > (c/d)^2 | + | |
- | [1] 0.04263585 | + | |
- | > </ | + | |
- | + | ||
- | ====== Semipartial cor ====== | + | |
- | See also [[: | + | |
- | + | ||
- | < | + | |
- | > colnames(tests) <- c(" | + | |
- | > tests <- subset(tests, | + | |
- | > attach(tests) | + | |
- | > cors <- cor(tests) | + | |
- | > round(cors, | + | |
- | | + | |
- | sat 1.000 0.875 0.718 | + | |
- | clep 0.875 1.000 0.876 | + | |
- | gpa 0.718 0.876 1.000 | + | |
- | > lm.sat.clep | + | |
- | > summary(lm.sat.clep) | + | |
Call: | Call: | ||
- | lm(formula = sat ~ clep, data = tests) | + | lm(formula = api00 ~ ell + avg_ed + meals + acs_k3, data = dvar) |
Residuals: | Residuals: | ||
- | Min | + | |
- | -101.860 -19.292 1.136 | + | -186.90 -40.13 |
Coefficients: | Coefficients: | ||
Estimate Std. Error t value Pr(> | Estimate Std. Error t value Pr(> | ||
- | (Intercept) | + | (Intercept) |
- | clep 17.665 3.464 5.100 0.00093 *** | + | ell -0.845 |
+ | avg_ed | ||
+ | meals | ||
+ | acs_k3 | ||
--- | --- | ||
Signif. codes: | Signif. codes: | ||
- | Residual standard error: | + | Residual standard error: |
- | Multiple R-squared: | + | Multiple R-squared: |
- | F-statistic: | + | F-statistic: |
- | > res.lm.sat.clep <- lm.sat.clep$residuals | ||
> | > | ||
- | > install.packages(" | + | > anova(mod2) |
- | > library(ppcor) | + | Analysis of Variance Table |
- | Loading required package: MASS | + | |
- | > # regression test for semipartial correlation | + | Response: api00 |
- | > spcor.gpa.sat.clep <- lm(gpa ~ res.lm.sat.clep) | + | |
- | > summary(spcor.gpa.sat.clep) | + | ell 1 4502711 4502711 1309.762 <2e-16 *** |
+ | avg_ed | ||
+ | meals | ||
+ | acs_k3 | ||
+ | Residuals 374 1285740 | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | </ | ||
+ | |||
+ | 이는 다른 독립변인들의 순서를 바꾸어도 마찬가지이다. mod3은 mod2에서 meals변인을 맨 앞으로 옮긴 예이다. 즉 | ||
+ | |||
+ | * mod | ||
+ | * mod2 <- lm(api00 ~ ell + avg_ed + meals + acs_k3) | ||
+ | * mod3 <- lm(api00 ~ meals + ell + avg_ed + acs_k3) | ||
+ | |||
+ | summary(mod), | ||
+ | < | ||
+ | > mod3 <- lm(api00 ~ meals + ell + avg_ed + acs_k3, data=dvar) | ||
+ | > summary(mod3) | ||
Call: | Call: | ||
- | lm(formula = gpa ~ res.lm.sat.clep) | + | lm(formula = api00 ~ meals + ell + avg_ed + acs_k3, data = dvar) |
Residuals: | Residuals: | ||
Min 1Q Median | Min 1Q Median | ||
- | -0.3756 -0.2694 -0.0092 0.2514 0.4686 | + | -186.90 |
Coefficients: | Coefficients: | ||
- | | + | |
- | (Intercept) | + | (Intercept) |
- | res.lm.sat.clep -0.0007015 | + | meals -2.948 0.196 -15.02 < 2e-16 *** |
+ | ell -0.845 | ||
+ | avg_ed | ||
+ | acs_k3 | ||
--- | --- | ||
Signif. codes: | Signif. codes: | ||
- | Residual standard error: | + | Residual standard error: |
- | Multiple R-squared: | + | Multiple R-squared: |
- | F-statistic: | + | F-statistic: |
- | </code> | + | |
- | From the above: Multiple R-squared: 0.009898 | + | > anova(mod2) |
- | From the below: spcor.gpa.sat.clep%%$%%estimate^2: | + | Analysis of Variance Table |
- | <code>> spcor.gpa.sat.clep <- spcor.test(gpa, | + | Response: api00 |
- | > spcor.gpa.sat.clep | + | |
- | estimate | + | ell 1 4480281 4480281 1297.34 <2e-16 *** |
- | 1 -0.09948786 | + | avg_ed |
- | > spcor.gpa.sat.clep$estimate^2 | + | meals |
- | [1] 0.009897835 | + | acs_k3 |
+ | Residuals 371 1281224 | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | > | ||
+ | > | ||
+ | > anova(mod3) | ||
+ | Analysis of Variance Table | ||
+ | |||
+ | Response: api00 | ||
+ | | ||
+ | meals 1 6219897 6219897 1801.08 < 2e-16 *** | ||
+ | ell | ||
+ | avg_ed | ||
+ | acs_k3 | ||
+ | Residuals 371 1281224 | ||
+ | --- | ||
+ | Signif. codes: | ||
> </ | > </ | ||
+ | ====== e.g. Using ppcor.test with 4 var ====== | ||
- | ====== e.g., ====== | + | < |
+ | options(digits = 4) | ||
+ | |||
+ | HSGPA <- c(3.0, 3.2, 2.8, 2.5, 3.2, 3.8, 3.9, 3.8, 3.5, 3.1) | ||
+ | FGPA <- c(2.8, 3.0, 2.8, 2.2, 3.3, 3.3, 3.5, 3.7, 3.4, 2.9) | ||
+ | SATV <- c(500, 550, 450, 400, 600, 650, 700, 550, 650, 550) | ||
+ | GREV <- c(600, 670, 540, 800, 750, 820, 830, 670, 690, 600) | ||
+ | ##GREV <- c(510, 670, 440, 800, 750, 420, 830, 470, 690, 600) | ||
+ | |||
+ | scholar <- data.frame(HSGPA, | ||
+ | describe(scholar) # provides descriptive information about each variable | ||
+ | |||
+ | corrs <- cor(scholar) # find the correlations and set them into an object called ' | ||
+ | corrs # print corrs | ||
+ | |||
+ | pairs(scholar) | ||
+ | |||
+ | pcor.test(scholar$GREV, | ||
+ | |||
+ | reg3 <- lm(GREV ~ SATV + HSGPA) | ||
+ | resid3 <- resid(reg3) | ||
+ | |||
+ | reg4 <- lm(FGPA ~ SATV + HSGPA) | ||
+ | resid4 <- resid(reg4) | ||
+ | |||
+ | cor(resid3, resid4) | ||
+ | </ | ||
+ | |||
+ | < | ||
+ | > pcor.test(scholar$GREV, | ||
+ | estimate p.value statistic | ||
+ | 1 | ||
+ | > | ||
+ | > reg3 <- lm(GREV ~ SATV + HSGPA) | ||
+ | > resid3 <- resid(reg3) | ||
+ | > | ||
+ | > reg4 <- lm(FGPA ~ SATV + HSGPA) | ||
+ | > resid4 <- resid(reg4) | ||
+ | > | ||
+ | > cor(resid3, resid4) | ||
+ | [1] -0.535 | ||
+ | |||
+ | |||
+ | </ | ||
+ | ---- | ||
+ | ---- | ||
+ | 학자인 A는 GRE점수는 (GREV) 학점에 신경을 쓰는 활동보다는 지능지수와 관련된다고 믿는 SATV의 영향력이 더 클것으로 생각된다. 그래서 SATV만의 영향력을 다른 변인을 콘트롤하여 살펴보고 싶다. | ||
+ | < | ||
+ | pcor.test(scholar$GREV, | ||
+ | |||
+ | reg7 <- lm(GREV ~ HSGPA + FGPA) # run linear regression | ||
+ | resid7 <- resid(reg7) | ||
+ | |||
+ | reg8 <- lm(SATV ~ HSGPA+ FGPA) # second regression | ||
+ | resid8 <- resid(reg8) | ||
+ | |||
+ | cor(resid7, resid8) | ||
+ | |||
+ | </ | ||
+ | |||
+ | < | ||
+ | > pcor.test(scholar$GREV, | ||
+ | estimate p.value statistic | ||
+ | 1 | ||
+ | > | ||
+ | > reg7 <- lm(GREV ~ HSGPA + FGPA) # run linear regression | ||
+ | > resid7 <- resid(reg7) | ||
+ | > | ||
+ | > reg8 <- lm(SATV ~ HSGPA+ FGPA) # second regression | ||
+ | > resid8 <- resid(reg8) | ||
+ | > | ||
+ | > cor(resid7, resid8) | ||
+ | [1] 0.3179 | ||
+ | > | ||
+ | > | ||
+ | </ | ||
+ | |||
+ | ====== e.g., 독립변인 들이 서로 독립적일 때의 각각의 설명력 | ||
In this example, the two IVs are orthogonal to each other (not correlated with each other). Hence, regress res.y.x2 against x1 would not result in any problem. | In this example, the two IVs are orthogonal to each other (not correlated with each other). Hence, regress res.y.x2 against x1 would not result in any problem. | ||
< | < | ||
Line 522: | Line 601: | ||
</ | </ | ||
< | < | ||
- | d <- data.frame(X1=x1, | + | d <- data.frame(X1=x1, |
plot(d) | plot(d) | ||
</ | </ | ||
- | {{partial.r.regression.png}} | + | {{: |
< | < | ||
- | [1] 0.0073</ | + | [1] 0.0073 |
+ | </ | ||
X2이 설명하는 Y분산의 나머지를 (1-R< | X2이 설명하는 Y분산의 나머지를 (1-R< | ||
Line 563: | Line 644: | ||
x2의 영향력을 control한 후에 x1영향력을 보면 64.54%에 달하게 된다. | x2의 영향력을 control한 후에 x1영향력을 보면 64.54%에 달하게 된다. | ||
+ | ====== Why overall model is significant while IVs are not? ====== | ||
+ | see https:// | ||
- | ====== e.g. ====== | + | < |
- | < | + | RSS = 3:10 #Right shoe size |
- | y x1 x2 | + | LSS = rnorm(RSS, RSS, 0.1) #Left shoe size - similar to RSS |
- | 1.644540 1.063845 .351188 | + | cor(LSS, RSS) # |
- | 1.785204 1.203146 .200000 | + | |
- | -1.36357 -.466514 -.961069 | + | weights = 120 + rnorm(RSS, 10*RSS, 10) |
- | .314549 1.175054 .800000 | + | |
- | .317955 .100612 .858597 | + | ##Fit a joint model |
- | .970097 2.438904 1.000000 | + | m = lm(weights ~ LSS + RSS) |
- | .664388 1.204048 .292670 | + | |
- | -.870252 -.993857 -1.89018 | + | ##F-value is very small, but neither LSS or RSS are significant |
- | 1.962192 .587540 -.275352 | + | summary(m) |
- | 1.036381 -.110834 -.246448 | + | |
- | .007415 -.069234 1.447422 | + | |
- | 1.634353 .965370 .467095 | + | |
- | .219813 .553268 .348095 | + | |
- | -.285774 .358621 .166708 | + | |
- | 1.498758 -2.87971 -1.13757 | + | |
- | 1.671538 -.310708 .396034 | + | |
- | 1.462036 .057677 1.401522 | + | |
- | -.563266 .904716 -.744522 | + | |
- | .297874 .561898 -.929709 | + | |
- | -1.54898 -.898084 -.838295 | + | |
</ | </ | ||
- | < | + | |
- | y x1 x2 | + | < |
- | 1.644540 1.063845 .351188 | + | > LSS = rnorm(RSS, RSS, 0.1) #Left shoe size - similar to RSS |
- | 1.785204 -1.20315 .200000 | + | > cor(LSS, RSS) # |
- | -1.36357 -.466514 -.961069 | + | [1] 0.9994836 |
- | .314549 1.175054 .800000 | + | > |
- | .317955 -.100612 .858597 | + | > weights = 120 + rnorm(RSS, 10*RSS, 10) |
- | .970097 1.438904 1.000000 | + | > |
- | .664388 1.204048 .292670 | + | > ##Fit a joint model |
- | -.870252 -.993857 -1.89018 | + | > m = lm(weights ~ LSS + RSS) |
- | 1.962192 -.587540 -.275352 | + | > |
- | 1.036381 -.110834 -.246448 | + | > ##F-value is very small, but neither LSS or RSS are significant |
- | .007415 -.069234 1.447422 | + | > summary(m) |
- | 1.634353 .965370 .467095 | + | |
- | .219813 .553268 .348095 | + | Call: |
- | -.285774 .358621 .166708 | + | lm(formula = weights ~ LSS + RSS) |
- | 1.498758 -2.87971 -1.13757 | + | |
- | 1.671538 -.810708 .396034 | + | Residuals: |
- | 1.462036 -.057677 1.401522 | + | |
- | -.563266 .904716 -.744522 | + | 4.8544 4.5254 -3.6333 -7.6402 -0.2467 -3.1997 -5.2665 10.6066 |
- | .297874 .561898 -.929709 | + | |
- | -1.54898 -1.26108 -.838295 | + | Coefficients: |
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) | ||
+ | LSS -14.162 35.447 -0.400 0.706 | ||
+ | RSS 26.305 | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 7.296 on 5 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: 59.92 on 2 and 5 DF, p-value: 0.000321 | ||
+ | |||
+ | > | ||
+ | > ##Fitting RSS or LSS separately gives a significant result. | ||
+ | > summary(lm(weights ~ LSS)) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = weights ~ LSS) | ||
+ | |||
+ | Residuals: | ||
+ | | ||
+ | -6.055 -4.930 -2.925 4.886 11.854 | ||
+ | |||
+ | Coefficients: | ||
+ | | ||
+ | (Intercept) | ||
+ | LSS 12.440 | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 7.026 on 6 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: 128.6 on 1 and 6 DF, p-value: 2.814e-05 | ||
+ | |||
+ | > | ||
</ | </ | ||
- | ====== e.g. 3, ====== | + | |
+ | |||
+ | |||
+ | |||
+ | |||
+ | ====== e. g. API00 data using ppcor package | ||
+ | This part is an extension of the [[:multiple regression# | ||
+ | |||
+ | We are going to use spcor for identifying the effect of each IV. In order to do that, we need to reconstruct the data with the involved variable. | ||
< | < | ||
- | set.seed(888) # for reproducibility | + | dvar <- read.csv(" |
- | S = rnorm(60, mean=0, sd=1.0) # the Suppressor is normally distributed | + | mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar) |
- | U = 1.1*S + rnorm(60, mean=0, sd=0.1) # U (unrelated) is Suppressor plus error | + | summary(mod) |
- | R <- rnorm(60, mean=0, sd=1.0) # related part; normally distributed | + | anova(mod) |
- | OV = U + R # the Other Variable is U plus R | + | |
- | Y = R + rnorm(60, mean=0, sd=2) # Y is R plus error | + | attach(dvar) |
+ | da1 <- data.frame(api00, ell, acs_k3, avg_ed, meals) | ||
+ | da2 <- data.frame(api00, ell, avg_ed, meals) | ||
+ | |||
+ | da1 <- na.omit(da1) | ||
+ | da2 <- na.omit(da2) | ||
+ | |||
+ | spcor(da1) | ||
+ | spcor(da2) | ||
</ | </ | ||
+ | |||
+ | < | ||
+ | > spcor(da1) | ||
+ | $estimate | ||
+ | | ||
+ | api00 | ||
+ | ell -0.13469956 | ||
+ | acs_k3 | ||
+ | avg_ed | ||
+ | meals -0.29972194 | ||
+ | |||
+ | $p.value | ||
+ | api00 ell acs_k3 | ||
+ | api00 0.000000e+00 0.07761805 0.5525340 0.085390280 2.403284e-10 | ||
+ | ell 8.918743e-03 0.00000000 0.2390272 0.232377348 1.558141e-03 | ||
+ | acs_k3 1.608778e-01 0.05998819 0.0000000 0.009891503 7.907183e-03 | ||
+ | avg_ed 1.912418e-02 0.27203887 0.1380449 0.000000000 7.424903e-05 | ||
+ | meals 3.041658e-09 0.04526574 0.2919775 0.006489783 0.000000e+00 | ||
+ | |||
+ | $statistic | ||
+ | | ||
+ | api00 | ||
+ | ell -2.628924 | ||
+ | acs_k3 | ||
+ | avg_ed | ||
+ | meals -6.075665 | ||
+ | |||
+ | $n | ||
+ | [1] 379 | ||
+ | |||
+ | $gp | ||
+ | [1] 3 | ||
+ | |||
+ | $method | ||
+ | [1] " | ||
+ | |||
+ | > spcor(da2) | ||
+ | $estimate | ||
+ | api00 | ||
+ | api00 | ||
+ | ell -0.1331295 | ||
+ | avg_ed | ||
+ | meals -0.3170300 | ||
+ | |||
+ | $p.value | ||
+ | api00 ell avg_ed | ||
+ | api00 0.000000e+00 0.08053412 0.099035570 2.160270e-11 | ||
+ | ell 9.465897e-03 0.00000000 0.172705169 2.366624e-03 | ||
+ | avg_ed 2.340012e-02 0.20663211 0.000000000 1.158869e-04 | ||
+ | meals 2.698723e-10 0.05296417 0.008170237 0.000000e+00 | ||
+ | |||
+ | $statistic | ||
+ | | ||
+ | api00 | ||
+ | ell -2.608123 | ||
+ | avg_ed | ||
+ | meals -6.490414 | ||
+ | |||
+ | $n | ||
+ | [1] 381 | ||
+ | |||
+ | $gp | ||
+ | [1] 2 | ||
+ | |||
+ | $method | ||
+ | [1] " | ||
+ | |||
+ | > | ||
+ | </ | ||
+ | ====== e.g. mpg model in mtcars (in r) ====== | ||
+ | {{: |
partial_and_semipartial_correlation.txt · Last modified: 2024/06/12 08:01 by hkimscil