partial_and_semipartial_correlation
references
{{https://
or [[https://
====== Partial and semipartial ======
<code>
options(digits = 4)
HSGPA <- c(3.0, 3.2, 2.8, 2.5, 3.2, 3.8, 3.9, 3.8, 3.5, 3.1)
FGPA <- c(2.8, 3.0, 2.8, 2.2, 3.3, 3.3, 3.5, 3.7, 3.4, 2.9)
SATV <- c(500, 550, 450, 400, 600, 650, 700, 550, 650, 550)
scholar <- data.frame(FGPA, HSGPA, SATV) # collect into a data frame

library(psych) # for describe()
describe(scholar) # provides descriptive information about each variable

corrs <- cor(scholar) # find the correlations
corrs # print corrs

pairs(scholar) # pairwise scatterplots
</code>
<code>
attach(scholar)
# freshman's GPA (FGPA) as a function of HSGPA and SATV
mod.all <- lm(FGPA ~ HSGPA + SATV, data = scholar)
summary(mod.all)
</code>
<code>
> summary(mod.all)

Call:
lm(formula = FGPA ~ HSGPA + SATV, data = scholar)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.2431 -0.1125 -0.0286  0.1269  0.2716 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept) 0.233102   0.456384    0.51   0.6252  
HSGPA       0.845192   0.283817    2.98   0.0206 *
SATV        0.000151   0.001405    0.11   0.9174  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.192 on 7 degrees of freedom
Multiple R-squared:  0.851,  Adjusted R-squared:  0.809 
F-statistic: 20.1 on 2 and 7 DF,  p-value: 0.00126

> 
</code>
[{{:
According to the t-test, SATV plays no significant role. Is this really true?
First, regress FGPA separately on SATV and on HSGPA.
The results below show that each IV on its own explains the dependent variable, FGPA, significantly.
That is, their explanatory power disappears only when the two IVs try to explain FGPA together.
<WRAP clear/>
<code>
attach(scholar)
ma1 <- lm(FGPA ~ SATV)
ma2 <- lm(FGPA ~ HSGPA)
summary(ma1)
summary(ma2)
</code>
<code>
> ma1 <- lm(FGPA ~ SATV)
> ma2 <- lm(FGPA ~ HSGPA)
> summary(ma1)

Call:
lm(formula = FGPA ~ SATV)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.2804 -0.1305 -0.0566  0.0350  0.6481 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 0.956329   0.544188    1.76   0.1190   
SATV        0.003810   0.000960    3.97   0.0041 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.27 on 8 degrees of freedom
Multiple R-squared:  0.663,  Adjusted R-squared:  0.621 
F-statistic: 15.8 on 1 and 8 DF,  p-value: 0.00412

> summary(ma2)

Call:
lm(formula = FGPA ~ HSGPA)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.2434 -0.1094 -0.0266  0.1259  0.2798 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    0.230      0.427    0.54  0.60414    
HSGPA          0.872      0.129    6.77  0.00014 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.179 on 8 degrees of freedom
Multiple R-squared:  0.851,  Adjusted R-squared:  0.833 
F-statistic: 45.8 on 1 and 8 DF,  p-value: 0.000137

> 
</code>
Below, we remove HSGPA's influence from both the IV (SATV) and the DV (FGPA), and then look at the influence of the IV on the DV.
<code>
m1 <- lm(FGPA ~ HSGPA)   # DV regressed on the control variable
m2 <- lm(SATV ~ HSGPA)   # IV regressed on the control variable
res.m1 <- resid(m1)      # the part of FGPA not explained by HSGPA
# res.m1 <- m1$residuals # the same thing
res.m2 <- resid(m2)      # the part of SATV not explained by HSGPA
m.12 <- lm(res.m1 ~ res.m2)
summary(m.12)
</code>
As the output below shows, the result is not significant.
<code>
> m1 <- lm(FGPA ~ HSGPA)
> m2 <- lm(SATV ~ HSGPA)
> res.m1 <- resid(m1)
> res.m2 <- resid(m2)
> m.12 <- lm(res.m1 ~ res.m2)
> summary(m.12)

Call:
lm(formula = res.m1 ~ res.m2)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.2431 -0.1125 -0.0286  0.1269  0.2716 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.50e-18   5.67e-02    0.00     1.00
res.m2       1.51e-04   1.31e-03    0.12     0.91

Residual standard error: 0.179 on 8 degrees of freedom
Multiple R-squared:  0.00165,  Adjusted R-squared:  -0.123 
F-statistic: 0.0132 on 1 and 8 DF,  p-value: 0.911

</code>
In particular, the R-squared value above is 0.00165, and its square root is $\sqrt{0.00165} = 0.04064$. This value is the correlation between SATV and FGPA after HSGPA's influence has been removed from both the IV (SATV) and the DV (FGPA). In fact, this number is the same as ''cor(res.m1, res.m2)'', as the following shows.
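The same value also falls out of the zero-order correlations alone. For these data the correlations (the values held in ''corrs'' above, rounded to four decimals; F = FGPA, S = SATV, H = HSGPA) are $r_{FS}=0.8144$, $r_{FH}=0.9226$, and $r_{SH}=0.8745$, so the standard partial correlation formula gives

$$ r_{FS.H} = \frac{r_{FS} - r_{FH} \; r_{SH}}{\sqrt{(1-r_{FH}^{2})(1-r_{SH}^{2})}} = \frac{0.8144 - (0.9226)(0.8745)}{\sqrt{(1-0.9226^{2})(1-0.8745^{2})}} \approx 0.0406 $$

which matches the 0.04064 above, up to rounding of the inputs.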
<code>
cor(res.m1, res.m2)
## or
cor.test(res.m1, res.m2)
</code>
<code>
> cor(res.m1, res.m2)
[1] 0.04064
> 
> cor.test(res.m1, res.m2)

	Pearson's product-moment correlation

data:  res.m1 and res.m2
t = 0.12, df = 8, p-value = 0.9
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.6045  0.6536
sample estimates:
    cor 
0.04064 

> 
</code>
We can get the same thing as a [[:partial correlation]], using the ''pcor.test()'' function from the ''ppcor'' package:
<code>
# install.packages("ppcor")
library(ppcor)
pcor.test(FGPA, SATV, HSGPA)
</code>
<code>
> pcor.test(FGPA, SATV, HSGPA)
  estimate p.value statistic  n gp  Method
1  0.04064  0.9174    0.1076 10  1 pearson
> </code>
The estimate above, 0.04064, equals the square root of the R-squared value from the residual regression. Expressed as a diagram, it looks like this:

[{{:

Now run the reverse case: control for SATV's influence and look at HSGPA's influence alone.
<code>
n1 <- lm(FGPA ~ SATV)    # DV regressed on the control variable (SATV)
n2 <- lm(HSGPA ~ SATV)   # IV regressed on the control variable
res.n1 <- resid(n1)
res.n2 <- resid(n2)
n.12 <- lm(res.n1 ~ res.n2)
summary(n.12)
</code>

<code>
> n1 <- lm(FGPA ~ SATV)
> n2 <- lm(HSGPA ~ SATV)
> res.n1 <- resid(n1)
> res.n2 <- resid(n2)
> n.12 <- lm(res.n1 ~ res.n2)
> summary(n.12)

Call:
lm(formula = res.n1 ~ res.n2)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.2431 -0.1125 -0.0286  0.1269  0.2716 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept) -6.93e-18   5.67e-02    0.00    1.000  
res.n2       8.45e-01   2.65e-01    3.18    0.013 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.179 on 8 degrees of freedom
Multiple R-squared:  0.559,  Adjusted R-squared:  0.504 
F-statistic: 10.1 on 1 and 8 DF,  p-value: 0.013

> 
</code>
Now we know that this R-squared value is 0.559. Its square root is $\sqrt{0.559} = 0.7477$: the influence of the IV (HSGPA alone) after SATV's influence has been removed from both the IV and the DV.

[{{:

<code>
pcor.test(FGPA, HSGPA, SATV)
</code>
<code>
> pcor.test(FGPA, HSGPA, SATV)
  estimate p.value statistic  n gp  Method
1   0.7476 0.02057     2.978 10  1 pearson
> </code>
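The same hand check works here (same zero-order correlations as before):

$$ r_{FH.S} = \frac{r_{FH} - r_{FS} \; r_{HS}}{\sqrt{(1-r_{FS}^{2})(1-r_{HS}^{2})}} = \frac{0.9226 - (0.8144)(0.8745)}{\sqrt{(1-0.8144^{2})(1-0.8745^{2})}} \approx 0.748 $$

which is the 0.7476 estimate above, up to rounding of the inputs.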
The explanation above also ties in with the [[:multiple regression]] document.

In the results below, note that the p values the independent variables get from ''anova()'' differ from the p values in ''summary()''. The two are computed differently, as the following shows.
<code>
# what anova() reports for acs_k3:
acs_k3  ...
# what summary(lm()) reports for acs_k3:
acs_k3  ...
</code>
The following is taken from the [[:multiple regression]] document.
<code>
dvar <- read.csv("...")
mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar)
summary(mod)
anova(mod)
</code>

<code>
> dvar <- read.csv("...")
> mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar)
> summary(mod)

Call:
lm(formula = api00 ~ ell + acs_k3 + avg_ed + meals, data = dvar)

Residuals:
     Min       1Q   Median       3Q      Max 
-187.020  -40.358  ...

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)      ...
ell              ...
acs_k3           ...
avg_ed           ...
meals        -2.9374        ...
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: ...
  (21 observations deleted due to missingness)
Multiple R-squared:  ...
F-statistic: ...

> anova(mod)
Analysis of Variance Table

Response: api00
           Df  Sum Sq Mean Sq  F value    Pr(>F)    
ell         1 4502711 4502711 1309.762 < 2.2e-16 ***
acs_k3      1     ...
avg_ed      1     ...
meals       1     ...
Residuals 374 1285740    3438                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> 
</code>
Looking at mod2 below:
  * api00 is the dependent variable, and
  * ell, avg_ed, meals, and acs_k3 are the independent variables;
  * their order differs from the earlier model,
  * which used ell, acs_k3, avg_ed, meals (acs_k3 has moved to the end).
  * That is:
    * lm(api00 ~ ell + acs_k3 + avg_ed + meals)
    * lm(api00 ~ ell + avg_ed + meals + acs_k3)

Because ''anova()'' assesses each independent variable sequentially, in the order entered (each term is adjusted only for the terms before it), its F values change when that order changes, while the ''summary()'' results stay the same.

<code>
> summary(mod2)

Call:
lm(formula = api00 ~ ell + avg_ed + meals + acs_k3, data = dvar)

Residuals:
    Min      1Q  Median      3Q     Max 
-186.90  ...

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     ...
ell             ...
avg_ed          ...
meals         -2.948      0.196  -15.04   <2e-16 ***
acs_k3          ...
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: ...
Multiple R-squared:  ...
F-statistic: ...

> anova(mod2)
Analysis of Variance Table

Response: api00
           Df  Sum Sq Mean Sq  F value Pr(>F)    
ell         1 4502711 4502711 1309.762 <2e-16 ***
avg_ed      1     ...
meals       1     ...
acs_k3      1     ...
Residuals 374 1285740    3438                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
</code>
The same thing happens however the independent variables are reordered. mod3 takes mod2 and moves the meals variable to the front. That is:

  * mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals)
  * mod2 <- lm(api00 ~ ell + avg_ed + meals + acs_k3)
  * mod3 <- lm(api00 ~ meals + ell + avg_ed + acs_k3)

The outputs of summary(mod), summary(mod2), and summary(mod3) do not differ from one another, but the anova() F values and their significance depend on which independent variable comes first.
<code>
> mod3 <- lm(api00 ~ meals + ell + avg_ed + acs_k3, data=dvar)
> summary(mod3)

Call:
lm(formula = api00 ~ meals + ell + avg_ed + acs_k3, data = dvar)

Residuals:
    Min      1Q  Median      3Q     Max 
-186.90  ...

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     ...
meals         -2.948      0.196  -15.04   <2e-16 ***
ell             ...
avg_ed          ...
acs_k3          ...
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 58.8 on 371 degrees of freedom
Multiple R-squared:  ...
F-statistic: ...

> anova(mod2)
Analysis of Variance Table

Response: api00
           Df  Sum Sq Mean Sq F value Pr(>F)    
ell         1 4480281 4480281 1297.34 <2e-16 ***
avg_ed      1     ...
meals       1     ...
acs_k3      1     ...
Residuals 371 1281224    3453                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> 
> anova(mod3)
Analysis of Variance Table

Response: api00
           Df  Sum Sq Mean Sq F value  Pr(>F)    
meals       1 6219897 6219897 1801.08 < 2e-16 ***
ell         1     ...
avg_ed      1     ...
acs_k3      1     ...
Residuals 371 1281224    3453                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> </code>
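If an order-independent table is wanted, base R's ''drop1()'' is one option (a side note, not part of the original analysis): it tests each IV as if it were entered last, so for these single-df terms its F values equal the squared t values from ''summary()'' and do not change when the formula is reordered.

<code>
# every term is dropped from the full model in turn,
# so the order of the IVs in the formula does not matter
drop1(mod, test = "F")
</code>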
====== e.g. Using pcor.test with 4 variables ======
- | Call: | + | < |
- | lm(formula | + | options(digits |
- | Residuals: | + | HSGPA <- c(3.0, 3.2, 2.8, 2.5, 3.2, 3.8, 3.9, 3.8, 3.5, 3.1) |
- | | + | FGPA <- c(2.8, 3.0, 2.8, 2.2, 3.3, 3.3, 3.5, 3.7, 3.4, 2.9) |
- | -101.860 -19.292 | + | SATV <- c(500, 550, 450, 400, 600, 650, 700, 550, 650, 550) |
+ | GREV <- c(600, 670, 540, 800, 750, 820, 830, 670, 690, 600) | ||
+ | ##GREV <- c(510, 670, 440, 800, 750, 420, 830, 470, 690, 600) | ||
- | Coefficients: | ||
- | Estimate Std. Error t value Pr(> | ||
- | (Intercept) | ||
- | clep 17.665 | ||
- | --- | ||
- | Signif. codes: | ||
- | Residual standard error: 48.2 on 8 degrees of freedom | + | scholar <- data.frame(HSGPA, FGPA, SATV, GREV) # collect into a data frame |
- | Multiple R-squared: | + | # install.packages(" |
- | F-statistic: | + | library(psych) |
+ | describe(scholar) # provides descrptive information about each variable | ||
- | > res.lm.sat.clep | + | corrs <- cor(scholar) # find the correlations and set them into an object called ' |
- | > | + | corrs # print corrs |
- | > install.packages(" | + | |
- | > library(ppcor) | + | |
- | Loading required package: MASS | + | |
- | > # regression test for semipartial correlation | + | pairs(scholar) # pairwise scatterplots |
- | > spcor.gpa.sat.clep <- lm(gpa ~ res.lm.sat.clep) | + | |
- | > summary(spcor.gpa.sat.clep) | + | |
- | Call: | + | # install.packages(" |
- | lm(formula = gpa ~ res.lm.sat.clep) | + | library(ppcor) |
+ | pcor.test(scholar$GREV, | ||
- | Residuals: | + | reg3 <- lm(GREV ~ SATV + HSGPA) |
- | | + | resid3 <- resid(reg3) |
- | -0.3756 -0.2694 | + | |
- | Coefficients: | + | reg4 <- lm(FGPA ~ SATV + HSGPA) # second regression |
- | Estimate Std. Error t value Pr(>|t|) | + | resid4 <- resid(reg4) # second set of residuals |
- | (Intercept) | + | |
- | res.lm.sat.clep -0.0007015 | + | |
- | --- | + | |
- | Signif. codes: | + | |
- | Residual standard error: 0.3382 on 8 degrees | + | cor(resid3, resid4) |
- | Multiple R-squared: | + | |
- | F-statistic: | + | |
</ | </ | ||
<code>
> pcor.test(scholar$GREV, scholar$FGPA, scholar[, c("SATV", "HSGPA")])
  estimate p.value statistic  n gp  Method
1   -0.535  0.1719    -1.551 10  2 pearson
> 
> reg3 <- lm(GREV ~ SATV + HSGPA)
> resid3 <- resid(reg3)
> 
> reg4 <- lm(FGPA ~ SATV + HSGPA)
> resid4 <- resid(reg4)
> 
> cor(resid3, resid4)
[1] -0.535

</code>
----
Scholar A believes that GRE scores (GREV) have more to do with SATV, which A takes to reflect intelligence, than with the grade-oriented effort behind GPAs. A therefore wants to look at the influence of SATV alone, controlling for the other variables.
<code>
pcor.test(scholar$GREV, scholar$SATV, scholar[, c("HSGPA", "FGPA")])

reg7 <- lm(GREV ~ HSGPA + FGPA) # run linear regression
resid7 <- resid(reg7)

reg8 <- lm(SATV ~ HSGPA + FGPA) # second regression
resid8 <- resid(reg8)

cor(resid7, resid8)
</code>

<code>
> pcor.test(scholar$GREV, scholar$SATV, scholar[, c("HSGPA", "FGPA")])
  estimate p.value statistic  n gp  Method
1   0.3179  0.4429    0.8213 10  2 pearson
> 
> reg7 <- lm(GREV ~ HSGPA + FGPA) # run linear regression
> resid7 <- resid(reg7)
> 
> reg8 <- lm(SATV ~ HSGPA + FGPA) # second regression
> resid8 <- resid(reg8)
> 
> cor(resid7, resid8)
[1] 0.3179
> 
</code>
====== e.g., The explanatory power of each IV when the IVs are independent of each other ======
In this example, the two IVs are orthogonal to each other (not correlated with each other). Hence, regressing the Y residuals (res.y.x2) against x1 does not cause any problem.
<code>
...
F-statistic: ...
> </code>
<code>
lm.y.x2 <- lm(y ~ x2)
...
</code>
<code>
d <- data.frame(X1=x1, ...)
plot(d)
</code>
{{:
<code>
[1] 0.0073
</code>
Of the variance in Y, take the part that X2 leaves unexplained (1-R<sup>2</sup>) and ask how much of it X1 can account for.

After controlling for x2's influence, x1's explanatory power comes to 64.54%.
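A self-contained sketch of this whole section (simulated data; the coefficients 0.8 and 0.5 are arbitrary, and x2 is constructed to be exactly orthogonal to x1):

<code>
set.seed(101)
n  <- 100
x1 <- rnorm(n)
x2 <- resid(lm(rnorm(n) ~ x1))  # residualizing makes x2 exactly uncorrelated with x1
y  <- 0.8 * x1 + 0.5 * x2 + rnorm(n)

r2.x1   <- summary(lm(y ~ x1))$r.squared
r2.x2   <- summary(lm(y ~ x2))$r.squared
r2.both <- summary(lm(y ~ x1 + x2))$r.squared

c(r2.x1 + r2.x2, r2.both)  # identical: with orthogonal IVs the R-squareds add up

# y's variance left over by x2, and x1's share of it:
res.y.x2 <- resid(lm(y ~ x2))
cor(res.y.x2, x1)^2        # equals r2.x1 / (1 - r2.x2)
</code>

The 64.54% above is this last kind of quantity: x1's squared correlation with y, rescaled by the share of variance that x2 left unexplained.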
====== Errors in regression results when X1 and X2 are highly correlated ======
see https://

<code>
RSS = 3:10 # right shoe sizes
LSS = rnorm(RSS, RSS, 0.1) # left shoe sizes - nearly identical to the right
cor(LSS, RSS) # correlation ~ 0.99

weights = 120 + rnorm(RSS, 10*RSS, 10)

# Fit a joint model
m <- lm(weights ~ LSS + RSS)

## The overall F test is highly significant, yet neither LSS nor RSS is
## individually significant
summary(m)

## Fitting RSS or LSS separately gives a significant result.
summary(lm(weights ~ LSS))
## or
summary(lm(weights ~ RSS))
</code>
<code>
> RSS = 3:10 # right shoe sizes
> LSS = rnorm(RSS, RSS, 0.1) # left shoe sizes - similar to RSS
> cor(LSS, RSS) # correlation ~ 0.99
[1] 0.9994836
> 
> weights = 120 + rnorm(RSS, 10*RSS, 10)
> 
> ## Fit a joint model
> m = lm(weights ~ LSS + RSS)
> 
> ## The overall F is significant, but neither LSS nor RSS is
> summary(m)

Call:
lm(formula = weights ~ LSS + RSS)

Residuals:
      1       2       3       4       5       6       7       8 
 4.8544  4.5254 -3.6333 -7.6402 -0.2467 -3.1997 -5.2665 10.6066 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)     ...
LSS          -14.162        ...
RSS           26.305     35.034   0.751    0.487
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.296 on 5 degrees of freedom
Multiple R-squared:  0.9599,  Adjusted R-squared:  0.9439 
F-statistic: 59.92 on 2 and 5 DF,  p-value: 0.000321

> 
> ## Fitting RSS or LSS separately gives a significant result.
> summary(lm(weights ~ LSS))

Call:
lm(formula = weights ~ LSS)

Residuals:
   Min     1Q Median     3Q    Max 
-6.055 -4.930 -2.925  4.886 11.854 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     ...
LSS           12.440      1.097   11.34 2.81e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 7.026 on 6 degrees of freedom
Multiple R-squared:  0.9554,  Adjusted R-squared:  0.948 
F-statistic: 128.6 on 1 and 6 DF,  p-value: 2.81e-05

> 
> ## or
> summary(lm(weights ~ RSS))

Call:
lm(formula = weights ~ RSS)

Residuals:
   Min     1Q Median     3Q    Max 
-13.46  -4.44   1.61    ...

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     ...
RSS             9.33       1.27    7.34  0.00033 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.24 on 6 degrees of freedom
Multiple R-squared:  0.8998,  Adjusted R-squared:  0.883 
F-statistic: 53.9 on 1 and 6 DF,  p-value: 0.000327

> 
</code>
====== e.g. API00 data using ppcor package ======
This part is an extension of the [[:multiple regression]] document.

We are going to use spcor to identify the effect of each IV. In order to do that, we need to reconstruct the data with only the variables involved.
<code>
dvar <- read.csv("...")
mod <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar)
summary(mod)
anova(mod)

attach(dvar)
da1 <- data.frame(api00, ell, acs_k3, avg_ed, meals)
da2 <- data.frame(api00, ell, avg_ed, meals)

da1 <- na.omit(da1)
da2 <- na.omit(da2)

library(ppcor)
spcor(da1)
spcor(da2)
</code>

<code>
> spcor(da1)
$estimate
             api00  ell acs_k3 avg_ed meals
api00   1.00000000  ...    ...    ...   ...
ell    -0.13469956  ...    ...    ...   ...
acs_k3         ...  ...    ...    ...   ...
avg_ed         ...  ...    ...    ...   ...
meals  -0.29972194  ...    ...    ...   ...

$p.value
              api00        ell    acs_k3      avg_ed        meals
api00  0.000000e+00 0.07761805 0.5525340 0.085390280 2.403284e-10
ell    8.918743e-03 0.00000000 0.2390272 0.232377348 1.558141e-03
acs_k3 1.608778e-01 0.05998819 0.0000000 0.009891503 7.907183e-03
avg_ed 1.912418e-02 0.27203887 0.1380449 0.000000000 7.424903e-05
meals  3.041658e-09 0.04526574 0.2919775 0.006489783 0.000000e+00

$statistic
           api00  ell acs_k3 avg_ed meals
api00        ...  ...    ...    ...   ...
ell    -2.628924  ...    ...    ...   ...
acs_k3       ...  ...    ...    ...   ...
avg_ed       ...  ...    ...    ...   ...
meals  -6.075665  ...    ...    ...   ...

$n
[1] 379

$gp
[1] 3

$method
[1] "pearson"

> spcor(da2)
$estimate
            api00  ell avg_ed meals
api00   1.0000000  ...    ...   ...
ell    -0.1331295  ...    ...   ...
avg_ed        ...  ...    ...   ...
meals  -0.3170300  ...    ...   ...

$p.value
              api00        ell      avg_ed        meals
api00  0.000000e+00 0.08053412 0.099035570 2.160270e-11
ell    9.465897e-03 0.00000000 0.172705169 2.366624e-03
avg_ed 2.340012e-02 0.20663211 0.000000000 1.158869e-04
meals  2.698723e-10 0.05296417 0.008170237 0.000000e+00

$statistic
           api00  ell avg_ed meals
api00        ...  ...    ...   ...
ell    -2.608123  ...    ...   ...
avg_ed       ...  ...    ...   ...
meals  -6.490414  ...    ...   ...

$n
[1] 381

$gp
[1] 2

$method
[1] "pearson"

> 
</code>
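The numbers above can be rebuilt by hand, which shows what a semipartial (part) correlation is: residualize one IV on the other IVs and correlate that residual with the untouched DV. A sketch for meals in da2, using the same objects as above; since the api00/meals entries reported are about -0.32, their square, roughly 0.10, is meals' unique share of api00's variance. (Note that ''spcor()'' reports both directions of each pair, so this reproduces the entry where meals is the residualized variable.)

<code>
# squared semipartial correlation of meals =
# drop in R-squared when meals is removed from the model
full    <- lm(api00 ~ ell + avg_ed + meals, data = da2)
reduced <- lm(api00 ~ ell + avg_ed, data = da2)
summary(full)$r.squared - summary(reduced)$r.squared

# the same quantity from residualizing meals only (api00 stays as it is)
res.meals <- resid(lm(meals ~ ell + avg_ed, data = da2))
cor(da2$api00, res.meals)^2
</code>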
====== e.g. mpg model in mtcars (in R) ======
{{: