====== Multi-colinearity check in r ====== required library: * corrplot * mctest * omcdiag * imcdiag > cps <- read.csv("http://commres.net/wiki/_media/cps_85_wages.csv", header = T, sep = "\t") > str(cps) 'data.frame': 534 obs. of 11 variables: $ education : int 8 9 12 12 12 13 10 12 16 12 ... $ south : int 0 0 0 0 0 0 1 0 0 0 ... $ sex : int 1 1 0 0 0 0 0 0 0 0 ... $ experience: int 21 42 1 4 17 9 27 9 11 9 ... $ union : int 0 0 0 0 0 1 0 0 0 0 ... $ wage : num 5.1 4.95 6.67 4 7.5 ... $ age : int 35 57 19 22 35 28 43 27 33 27 ... $ race : int 2 3 3 3 3 3 3 3 3 3 ... $ occupation: int 6 6 6 6 6 6 6 6 6 6 ... $ sector : int 1 1 1 0 0 0 0 0 1 0 ... $ marr : int 1 1 0 0 1 0 0 0 1 0 ... > head(cps) > head(cps) education south sex experience union wage age race occupation sector marr 1 8 0 1 21 0 5.10 35 2 6 1 1 2 9 0 1 42 0 4.95 57 3 6 1 1 3 12 0 0 1 0 6.67 19 3 6 1 0 4 12 0 0 4 0 4.00 22 3 6 0 0 5 12 0 0 17 0 7.50 35 3 6 0 1 6 13 0 0 9 1 13.07 28 3 6 0 0 > lm1 = lm(log(cps$wage) ~., data = cps) > summary(lm1) Call: lm(formula = log(cps$wage) ~ ., data = cps) Residuals: Min 1Q Median 3Q Max -2.16246 -0.29163 -0.00469 0.29981 1.98248 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.078596 0.687514 1.569 0.117291 education 0.179366 0.110756 1.619 0.105949 south -0.102360 0.042823 -2.390 0.017187 * sex -0.221997 0.039907 -5.563 4.24e-08 *** experience 0.095822 0.110799 0.865 0.387531 union 0.200483 0.052475 3.821 0.000149 *** age -0.085444 0.110730 -0.772 0.440671 race 0.050406 0.028531 1.767 0.077865 . occupation -0.007417 0.013109 -0.566 0.571761 sector 0.091458 0.038736 2.361 0.018589 * marr 0.076611 0.041931 1.827 0.068259 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4398 on 523 degrees of freedom Multiple R-squared: 0.3185, Adjusted R-squared: 0.3054 F-statistic: 24.44 on 10 and 523 DF, p-value: < 2.2e-16 plot(lm1) {{lm1.plot1.png?300}}{{lm1.plot2.png?300}} {{lm1.plot3.png?300}}{{lm1.plot4.png?300}} > library(corrplot) > cps.cor = cor(cps) > corrplot.mixed(cps.cor, lower.col = "black") {{cps.corrplot.png?500}} > install.packages("mctest") > library(mctest) > omcdiag(cps[,c(-6)], cps$wage) # or "omcdiag(cps[,c(1:5,7:11)], cps$wage)" will work as well. Call: omcdiag(x = cps[, c(-6)], y = cps$wage) Overall Multicollinearity Diagnostics MC Results detection Determinant |X'X|: 0.0001 1 Farrar Chi-Square: 4833.5751 1 Red Indicator: 0.1983 0 Sum of Lambda Inverse: 10068.8439 1 Theil's Method: 1.2263 1 Condition Number: 739.7337 1 1 --> COLLINEARITY is detected by the test 0 --> COLLINEARITY is not detected by the test > > imcdiag(cps[,c(-6)],cps$wage) Call: imcdiag(x = cps[, c(-6)], y = cps$wage) All Individual Multicollinearity Diagnostics Result VIF TOL Wi Fi Leamer CVIF Klein education 231.1956 0.0043 13402.4982 15106.5849 0.0658 236.4725 1 south 1.0468 0.9553 2.7264 3.0731 0.9774 1.0707 0 sex 1.0916 0.9161 5.3351 6.0135 0.9571 1.1165 0 experience 5184.0939 0.0002 301771.2445 340140.5368 0.0139 5302.4188 1 union 1.1209 0.8922 7.0368 7.9315 0.9445 1.1464 0 age 4645.6650 0.0002 270422.7164 304806.1391 0.0147 4751.7005 1 race 1.0371 0.9642 2.1622 2.4372 0.9819 1.0608 0 occupation 1.2982 0.7703 17.3637 19.5715 0.8777 1.3279 0 sector 1.1987 0.8343 11.5670 13.0378 0.9134 1.2260 0 marr 1.0961 0.9123 5.5969 6.3085 0.9551 1.1211 0 1 --> COLLINEARITY is detected by the test 0 --> COLLINEARITY is not detected by the test education , south , experience , age , race , occupation , sector , marr , coefficient(s) are non-significant may be due to multicollinearity R-square of y on all x: 0.2805 * use method argument to check which regressors may be the reason of collinearity =================================== > > round(pcor(cps[,c(-6)], method = "pearson")$estimate,4) education south sex experience union age race occupation sector marr education 1.0000 -0.0318 0.0515 -0.9976 -0.0075 0.9973 0.0172 0.0294 -0.0213 -0.0403 south -0.0318 1.0000 -0.0302 -0.0223 -0.0975 0.0215 -0.1112 0.0084 -0.0215 0.0304 sex 0.0515 -0.0302 1.0000 0.0550 -0.1201 -0.0537 0.0200 -0.1428 -0.1121 0.0042 experience -0.9976 -0.0223 0.0550 1.0000 -0.0102 0.9999 0.0109 0.0421 -0.0133 -0.0410 union -0.0075 -0.0975 -0.1201 -0.0102 1.0000 0.0122 -0.1077 0.2130 -0.0135 0.0689 age 0.9973 0.0215 -0.0537 0.9999 0.0122 1.0000 -0.0108 -0.0441 0.0146 0.0451 race 0.0172 -0.1112 0.0200 0.0109 -0.1077 -0.0108 1.0000 0.0575 0.0064 0.0556 occupation 0.0294 0.0084 -0.1428 0.0421 0.2130 -0.0441 0.0575 1.0000 0.3147 -0.0186 sector -0.0213 -0.0215 -0.1121 -0.0133 -0.0135 0.0146 0.0064 0.3147 1.0000 0.0365 marr -0.0403 0.0304 0.0042 -0.0410 0.0689 0.0451 0.0556 -0.0186 0.0365 1.0000 > lm2 = lm(log(cps$wage) ~ . -age , data = cps) > summary(lm2) Call: lm(formula = log(cps$wage) ~ . - age, data = cps) Residuals: Min 1Q Median 3Q Max -2.16044 -0.29073 -0.00505 0.29994 1.97997 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.562676 0.160116 3.514 0.000479 *** education 0.094135 0.008188 11.497 < 2e-16 *** south -0.103071 0.042796 -2.408 0.016367 * sex -0.220344 0.039834 -5.532 5.02e-08 *** experience 0.010335 0.001746 5.919 5.86e-09 *** union 0.199987 0.052450 3.813 0.000154 *** race 0.050643 0.028519 1.776 0.076345 . occupation -0.006971 0.013091 -0.532 0.594619 sector 0.091022 0.038717 2.351 0.019094 * marr 0.075152 0.041872 1.795 0.073263 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4397 on 524 degrees of freedom Multiple R-squared: 0.3177, Adjusted R-squared: 0.306 F-statistic: 27.11 on 9 and 524 DF, p-value: < 2.2e-16 > summary(lm1) Call: lm(formula = log(cps$wage) ~ ., data = cps) Residuals: Min 1Q Median 3Q Max -2.16246 -0.29163 -0.00469 0.29981 1.98248 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.078596 0.687514 1.569 0.117291 education 0.179366 0.110756 1.619 0.105949 south -0.102360 0.042823 -2.390 0.017187 * sex -0.221997 0.039907 -5.563 4.24e-08 *** experience 0.095822 0.110799 0.865 0.387531 union 0.200483 0.052475 3.821 0.000149 *** age -0.085444 0.110730 -0.772 0.440671 race 0.050406 0.028531 1.767 0.077865 . occupation -0.007417 0.013109 -0.566 0.571761 sector 0.091458 0.038736 2.361 0.018589 * marr 0.076611 0.041931 1.827 0.068259 . --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4398 on 523 degrees of freedom Multiple R-squared: 0.3185, Adjusted R-squared: 0.3054 F-statistic: 24.44 on 10 and 523 DF, p-value: < 2.2e-16 > > ====== regression test with factors ====== > cps$sex <- factor(cps$sex) > cps$union <- factor(cps$union) > cps$race <- factor(cps$race) > cps$sector <- factor(cps$sector) > cps$occupation <- factor(cps$occupation) > cps$marr <- factor(cps$marr) > str(cps) 'data.frame': 534 obs. of 11 variables: $ education : int 8 9 12 12 12 13 10 12 16 12 ... $ south : int 0 0 0 0 0 0 1 0 0 0 ... $ sex : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ... $ experience: int 21 42 1 4 17 9 27 9 11 9 ... $ union : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ... $ wage : num 5.1 4.95 6.67 4 7.5 ... $ age : int 35 57 19 22 35 28 43 27 33 27 ... $ race : Factor w/ 3 levels "1","2","3": 2 3 3 3 3 3 3 3 3 3 ... $ occupation: Factor w/ 6 levels "1","2","3","4",..: 6 6 6 6 6 6 6 6 6 6 ... $ sector : Factor w/ 3 levels "0","1","2": 2 2 2 1 1 1 1 1 2 1 ... $ marr : Factor w/ 2 levels "0","1": 2 2 1 1 2 1 1 1 2 1 ... > lm4 = lm(log(cps$wage) ~ . -age, data = cps) > summary(lm4) Call: lm(formula = log(cps$wage) ~ . - age, data = cps) Residuals: Min 1Q Median 3Q Max -2.36103 -0.28080 0.00362 0.27793 1.79594 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.194821 0.181804 6.572 1.21e-10 *** education 0.066603 0.010060 6.621 8.96e-11 *** south -0.093384 0.041931 -2.227 0.02637 * sex1 -0.216934 0.041844 -5.184 3.11e-07 *** experience 0.009371 0.001725 5.431 8.63e-08 *** union1 0.211506 0.051218 4.129 4.24e-05 *** race2 -0.033928 0.099051 -0.343 0.73209 race3 0.079851 0.057392 1.391 0.16472 occupation2 -0.364444 0.091500 -3.983 7.78e-05 *** occupation3 -0.210295 0.076175 -2.761 0.00597 ** occupation4 -0.383882 0.080990 -4.740 2.77e-06 *** occupation5 -0.050664 0.072717 -0.697 0.48628 occupation6 -0.265348 0.079969 -3.318 0.00097 *** sector1 0.114857 0.054862 2.094 0.03678 * sector2 0.093138 0.096514 0.965 0.33499 marr1 0.062211 0.041025 1.516 0.13002 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4278 on 518 degrees of freedom Multiple R-squared: 0.3614, Adjusted R-squared: 0.3429 F-statistic: 19.54 on 15 and 518 DF, p-value: < 2.2e-16 > > lm5 = lm(log(cps$wage) ~ . -age -race, data = cps) > summary(lm5) Call: lm(formula = log(cps$wage) ~ . - age - race, data = cps) Residuals: Min 1Q Median 3Q Max -2.34366 -0.28169 -0.00017 0.29179 1.81158 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.224289 0.172070 7.115 3.73e-12 *** education 0.068838 0.009912 6.945 1.14e-11 *** south -0.102588 0.041668 -2.462 0.014139 * sex1 -0.213602 0.041842 -5.105 4.65e-07 *** experience 0.009494 0.001723 5.510 5.65e-08 *** union1 0.202720 0.051009 3.974 8.06e-05 *** occupation2 -0.355381 0.091448 -3.886 0.000115 *** occupation3 -0.209820 0.076149 -2.755 0.006068 ** occupation4 -0.385680 0.080855 -4.770 2.40e-06 *** occupation5 -0.047694 0.072746 -0.656 0.512351 occupation6 -0.254277 0.079781 -3.187 0.001523 ** sector1 0.111458 0.054845 2.032 0.042636 * sector2 0.099777 0.096481 1.034 0.301541 marr1 0.065464 0.041036 1.595 0.111257 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4283 on 520 degrees of freedom Multiple R-squared: 0.3573, Adjusted R-squared: 0.3412 F-statistic: 22.24 on 13 and 520 DF, p-value: < 2.2e-16 > > lm6 = lm(log(cps$wage) ~ . -age -race -occupation -marr -sector, data = cps) > summary(lm6) Call: lm(formula = log(cps$wage) ~ . - age - race - occupation - marr - sector, data = cps) Residuals: Min 1Q Median 3Q Max -2.13809 -0.28681 -0.00078 0.29376 1.96678 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.731792 0.122217 5.988 3.94e-09 *** education 0.094096 0.007942 11.848 < 2e-16 *** south -0.111761 0.042857 -2.608 0.009372 ** sex1 -0.231978 0.039202 -5.918 5.88e-09 *** experience 0.011548 0.001680 6.875 1.75e-11 *** union1 0.198360 0.051243 3.871 0.000122 *** --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.4433 on 528 degrees of freedom Multiple R-squared: 0.3011, Adjusted R-squared: 0.2944 F-statistic: 45.49 on 5 and 528 DF, p-value: < 2.2e-16 >