====== PCA ======
[[https://www.theanalysisfactor.com/the-fundamental-difference-between-principal-component-analysis-and-factor-analysis/|Difference between PCA and FA]]
  * Both are data reduction techniques: they let you capture the variance in a set of variables with a smaller number of components or factors.
  * Both are usually run in statistical software with the same procedure, and the output looks much the same.
  * The steps you take to run them are the same: extraction, interpretation, rotation, and choosing the number of factors or components.
  * Despite all these similarities, there is a fundamental difference between them: PCA is a linear combination of variables, while factor analysis is a measurement model of a latent variable (see the short R sketch after this list).
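A minimal sketch of this distinction in R, using prcomp() for principal components and factanal() for a common-factor (latent variable) model. The data (a few built-in mtcars columns) and the choice of two factors are arbitrary and only for illustration:

<code>
## Illustration only: any multivariate data frame would do; the variables
## and the number of factors below are arbitrary choices.
X <- scale(mtcars[, c("mpg", "disp", "hp", "drat", "wt", "qsec")])

## PCA: each component is simply a weighted linear combination of the
## observed variables (no latent-variable model, no error term)
pca <- prcomp(X)
round(pca$rotation[, 1], 3)      # weights that define the first component

## FA: a measurement model in which latent factors are assumed to cause
## the observed variables, each with its own unique error
fa <- factanal(X, factors = 2)
print(fa$loadings, cutoff = 0)   # loadings of the items on the latent factors
</code>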
====== Some useful lectures ======

{{youtube>FgakZw6K1QQ}}
<WRAP clear />
{{youtube>g-Hb26agBFg}}
<WRAP clear />
{{youtube>0Jp4gsfOLMs}}
<WRAP clear />

{{pca_gsp.csv}}
{{youtube>Od8gfNOOS9o}}
<WRAP clear />

<code>## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))

  data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
dim(data.matrix)

pca <- prcomp(t(data.matrix), scale=TRUE)

## plot pc1 and pc2
plot(pca$x[,1], pca$x[,2])

## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)

barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)

pca.data <- data.frame(Sample=rownames(pca$x),
  X=pca$x[,1],
  Y=pca$x[,2])
pca.data

ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

## get the names of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])

top_10_genes ## show the names of the top 10 genes

pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)

#######
##
## NOTE: Everything that follows is just bonus stuff.
## It simply demonstrates how to get the same
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
#######

############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
##     in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
##     sum(the rotation * the original data), but compressed to the unit vector
##     You can spread it out by multiplying by "d"
## d = this is similar to the "sdev" value that prcomp() returns (and thus
##     related to the eigenvalues), but not
##     scaled by sample size in an unbiased way (i.e. 1/(n-1)).
##     For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
##     For svd(), d = sqrt(ss(fit))
##
############################################

svd.stuff <- svd(scale(t(data.matrix), center=TRUE))

## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
  X=(svd.stuff$u[,1] * svd.stuff$d[1]),
  Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data

## alternatively, we could compute the PCs with the eigenvectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first two principal components

svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)

ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("svd(scale(t(data.matrix), center=TRUE))")

############################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigenvectors (vectors of loading scores)
##           NOTE: pcs = sum(loading scores * values for sample)
## values = eigenvalues
##
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)

## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])

eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]

eigen.data <- data.frame(Sample=rownames(eigen.pcs),
  X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
  Y=eigen.pcs[,2]) ## X axis will be PC1, Y axis will be PC2
eigen.data

eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)

ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("eigen on cov(t(data.matrix))")
</code>
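The comment block above relates prcomp()'s sdev to svd()'s d. A quick sanity check of that relationship, assuming the objects from the block above (data.matrix, pca, svd.stuff) are still in the workspace:

<code>
## prcomp()'s sdev should equal svd()'s d divided by sqrt(n - 1),
## where n is the number of samples (the rows of t(data.matrix))
n <- ncol(data.matrix)                            # 10 samples
all.equal(pca$sdev, svd.stuff$d / sqrt(n - 1))    # should be TRUE
</code>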

====== e.g. saq ======
SPSS Anxiety Questionnaire
{{:r:saq8.csv}}

<code>
# saq <- read.csv("http://commres.net/wiki/_media/r/saq.csv", header = T)
saq8 <- read.csv("http://commres.net/wiki/_media/r/saq8.csv", header = T)
head(saq8)
saq8 <- saq8[c(-1)]   # drop the first column so only the eight questionnaire items remain
</code>

<code>
> round(cor(saq8), 3)
              stat_cry afraid_spss sd_excite nmare_pearson du_stat lexp_comp comp_hate good_math
stat_cry         1.000      -0.099    -0.337         0.436   0.402     0.217     0.305     0.331
afraid_spss     -0.099       1.000     0.318        -0.112  -0.119    -0.074    -0.159    -0.050
sd_excite       -0.337       0.318     1.000        -0.380  -0.310    -0.227    -0.382    -0.259
nmare_pearson    0.436      -0.112    -0.380         1.000   0.401     0.278     0.409     0.349
du_stat          0.402      -0.119    -0.310         0.401   1.000     0.257     0.339     0.269
lexp_comp        0.217      -0.074    -0.227         0.278   0.257     1.000     0.514     0.223
comp_hate        0.305      -0.159    -0.382         0.409   0.339     0.514     1.000     0.297
good_math        0.331      -0.050    -0.259         0.349   0.269     0.223     0.297     1.000
</code>

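PCA on standardized variables amounts to an eigen decomposition of this correlation matrix, as in the eigen() example earlier. A minimal sketch, assuming saq8 from the code above:

<code>
## eigen decomposition of the correlation matrix (assumes saq8 from above)
saq8.eigen <- eigen(cor(saq8))
round(saq8.eigen$values, 3)                                  # eigenvalues (variance per component)
round(saq8.eigen$values / sum(saq8.eigen$values) * 100, 1)   # percent of variance per component
</code>
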
<code>
install.packages("Hmisc")
library(Hmisc)
## rcorr() returns the correlations ($r), sample sizes ($n), and p-values ($P)
saq8.rcorr <- rcorr(as.matrix(saq8))

print(saq8.rcorr$r, digits = 3)
</code>
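
The same data can be run through prcomp(), as in the main example above. A minimal sketch, assuming the saq8 data frame loaded above (ID column already dropped):

<code>
## PCA on the eight saq8 items (assumes saq8 from above)
saq8.pca <- prcomp(saq8, scale = TRUE)
summary(saq8.pca)                       # proportion of variance per component

## scree plot of percent variance explained
saq8.var.per <- round(saq8.pca$sdev^2 / sum(saq8.pca$sdev^2) * 100, 1)
barplot(saq8.var.per, main = "Scree Plot",
        xlab = "Principal Component", ylab = "Percent Variation")

## loadings of the items on the first two components
round(saq8.pca$rotation[, 1:2], 3)
</code>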