principal_component_analysis
Table of Contents
PCA
- Both are data reduction techniques — they allow you to capture the variance in variables in a smaller set.
- Both are usually run in stat software using the same procedure, and the output looks pretty much the same.
- The steps you take to run them are the same-extraction, interpretation, rotation, choosing the number of factors or components.
- Despite all these similarities, there is a fundamental difference between them: PCA is a linear combination of variables; Factor Analysis is a measurement model of a latent variable.
Some useful lectures
pca_gsp.csv
Od8gfNOOS9o
## In this example, the data is in a matrix called ## data.matrix ## columns are individual samples (i.e. cells) ## rows are measurements taken for all the samples (i.e. genes) ## Just for the sake of the example, here's some made up data... data.matrix <- matrix(nrow=100, ncol=10) colnames(data.matrix) <- c( paste("wt", 1:5, sep=""), paste("ko", 1:5, sep="")) rownames(data.matrix) <- paste("gene", 1:100, sep="") for (i in 1:100) { wt.values <- rpois(5, lambda=sample(x=10:1000, size=1)) ko.values <- rpois(5, lambda=sample(x=10:1000, size=1)) data.matrix[i,] <- c(wt.values, ko.values) } head(data.matrix) dim(data.matrix) pca <- prcomp(t(data.matrix), scale=TRUE) ## plot pc1 and pc2 plot(pca$x[,1], pca$x[,2]) ## make a scree plot pca.var <- pca$sdev^2 pca.var.per <- round(pca.var/sum(pca.var)*100, 1) barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation") ## now make a fancy looking plot that shows the PCs and the variation: library(ggplot2) pca.data <- data.frame(Sample=rownames(pca$x), X=pca$x[,1], Y=pca$x[,2]) pca.data ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) + geom_text() + xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) + ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) + theme_bw() + ggtitle("My PCA Graph") ## get the name of the top 10 measurements (genes) that contribute ## most to pc1. loading_scores <- pca$rotation[,1] gene_scores <- abs(loading_scores) ## get the magnitudes gene_score_ranked <- sort(gene_scores, decreasing=TRUE) top_10_genes <- names(gene_score_ranked[1:10]) top_10_genes ## show the names of the top 10 genes pca$rotation[top_10_genes,1] ## show the scores (and +/- sign) ####### ## ## NOTE: Everything that follow is just bonus stuff. ## It simply demonstrates how to get the same ## results using "svd()" (Singular Value Decomposition) or using "eigen()" ## (Eigen Decomposition). ## ####### ############################################ ## ## Now let's do the same thing with svd() ## ## svd() returns three things ## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors ## in other words, a matrix of loading scores ## u = this is similar to the "x" that prcomp() returns. In other words, ## sum(the rotation * the original data), but compressed to the unit vector ## You can spread it out by multiplying by "d" ## d = this is similar to the "sdev" value that prcomp() returns (and thus ## related to the eigen values), but not ## scaled by sample size in an unbiased way (ie. 1/(n-1)). ## For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1)) ## For svd(), d = sqrt(ss(fit)) ## ############################################ svd.stuff <- svd(scale(t(data.matrix), center=TRUE)) ## calculate the PCs svd.data <- data.frame(Sample=colnames(data.matrix), X=(svd.stuff$u[,1] * svd.stuff$d[1]), Y=(svd.stuff$u[,2] * svd.stuff$d[2])) svd.data ## alternatively, we could compute the PCs with the eigen vectors and the ## original data svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE))) svd.pcs[,1:2] ## the first to principal components svd.df <- ncol(data.matrix) - 1 svd.var <- svd.stuff$d^2 / svd.df svd.var.per <- round(svd.var/sum(svd.var)*100, 1) ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) + geom_text() + xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) + ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) + theme_bw() + ggtitle("svd(scale(t(data.matrix), center=TRUE)") ############################################ ## ## Now let's do the same thing with eigen() ## ## eigen() returns two things... ## vectors = eigen vectors (vectors of loading scores) ## NOTE: pcs = sum(loading scores * values for sample) ## values = eigen values ## ############################################ cov.mat <- cov(scale(t(data.matrix), center=TRUE)) dim(cov.mat) ## since the covariance matrix is symmetric, we can tell eigen() to just ## work on the lower triangle with "symmetric=TRUE" eigen.stuff <- eigen(cov.mat, symmetric=TRUE) dim(eigen.stuff$vectors) head(eigen.stuff$vectors[,1:2]) eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE))) eigen.pcs[,1:2] eigen.data <- data.frame(Sample=rownames(eigen.pcs), X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back Y=eigen.pcs[,2]) ## X axis will be PC1, Y axis will be PC2 eigen.data eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1) ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) + geom_text() + xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) + ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) + theme_bw() + ggtitle("eigen on cov(t(data.matrix))")
e.g. saq
SPSS Anxiety Questionnaire
saq8.csv
# saq <- read.csv("http://commres.net/wiki/_media/r/saq.csv", header = T) saq8 <- read.csv("http://commres.net/wiki/_media/r/saq8.csv", header = T) head(saq8) saq8 <- saq8[c(-1)]
> round(cor(saq8),3) stat_cry afraid_spss sd_excite nmare_pearson du_stat lexp_comp comp_hate good_math stat_cry 1.000 -0.099 -0.337 0.436 0.402 0.217 0.305 0.331 afraid_spss -0.099 1.000 0.318 -0.112 -0.119 -0.074 -0.159 -0.050s sd_excite -0.337 0.318 1.000 -0.380 -0.310 -0.227 -0.382 -0.259 nmare_pearson 0.436 -0.112 -0.380 1.000 0.401 0.278 0.409 0.349 du_stat 0.402 -0.119 -0.310 0.401 1.000 0.257 0.339 0.269 lexp_comp 0.217 -0.074 -0.227 0.278 0.257 1.000 0.514 0.223 comp_hate 0.305 -0.159 -0.382 0.409 0.339 0.514 1.000 0.297 good_math 0.331 -0.050 -0.259 0.349 0.269 0.223 0.297 1.000 >
install.packages("Hmisc") library(Hmisc) saq8.rcorr <- rcorr(as.matrix(saq8)) print(saq8.rcorr$r, digits = 3)
principal_component_analysis.txt · Last modified: 2019/11/16 15:06 by hkimscil