====== PCA ======
[[https://www.theanalysisfactor.com/the-fundamental-difference-between-principal-component-analysis-and-factor-analysis/|Difference between PCA and FA]]
  * Both are data reduction techniques: they let you capture the variance in a set of variables with a smaller number of components or factors.
  * Both are usually run in statistical software with the same procedure, and the output looks much the same.
  * The steps you take to run them are the same: extraction, interpretation, rotation, and choosing the number of factors or components.
  * Despite all these similarities, there is a fundamental difference between them: PCA is a linear combination of variables, while factor analysis is a measurement model of a latent variable (see the short R sketch after this list).
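A minimal sketch of this distinction in R, using prcomp() for principal components and factanal() for a common-factor (latent variable) model. The data (a few built-in mtcars columns) and the choice of two factors are arbitrary and only for illustration:

<code>
## Illustration only: any multivariate data frame would do; the variables
## and the number of factors below are arbitrary choices.
X <- scale(mtcars[, c("mpg", "disp", "hp", "drat", "wt", "qsec")])

## PCA: each component is simply a weighted linear combination of the
## observed variables (no latent-variable model, no error term)
pca <- prcomp(X)
round(pca$rotation[, 1], 3)      # weights that define the first component

## FA: a measurement model in which latent factors are assumed to cause
## the observed variables, each with its own unique error
fa <- factanal(X, factors = 2)
print(fa$loadings, cutoff = 0)   # loadings of the items on the latent factors
</code>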
====== Some useful lectures ======

{{youtube>FgakZw6K1QQ}}
<WRAP clear />
{{youtube>g-Hb26agBFg}}
<WRAP clear />
{{youtube>0Jp4gsfOLMs}}
<WRAP clear />

{{pca_gsp.csv}}
{{youtube>Od8gfNOOS9o}}
<WRAP clear />

<code>## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))

  data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
dim(data.matrix)

pca <- prcomp(t(data.matrix), scale=TRUE)

## plot pc1 and pc2
plot(pca$x[,1], pca$x[,2])

## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)

barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)

pca.data <- data.frame(Sample=rownames(pca$x),
  X=pca$x[,1],
  Y=pca$x[,2])
pca.data

ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

## get the names of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])

top_10_genes ## show the names of the top 10 genes

pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)

#######
##
## NOTE: Everything that follows is just bonus stuff.
## It simply demonstrates how to get the same
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
#######

############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
##     in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
##     sum(the rotation * the original data), but compressed to the unit vector
##     You can spread it out by multiplying by "d"
## d = this is similar to the "sdev" value that prcomp() returns (and thus
##     related to the eigenvalues), but not
##     scaled by sample size in an unbiased way (i.e. 1/(n-1)).
##     For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
##     For svd(), d = sqrt(ss(fit))
##
############################################

svd.stuff <- svd(scale(t(data.matrix), center=TRUE))

## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
  X=(svd.stuff$u[,1] * svd.stuff$d[1]),
  Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data

## alternatively, we could compute the PCs with the eigenvectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first two principal components

svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)

ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("svd(scale(t(data.matrix), center=TRUE))")

############################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigenvectors (vectors of loading scores)
##           NOTE: pcs = sum(loading scores * values for sample)
## values = eigenvalues
##
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)

## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])

eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]

eigen.data <- data.frame(Sample=rownames(eigen.pcs),
  X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
  Y=eigen.pcs[,2]) ## X axis will be PC1, Y axis will be PC2
eigen.data

eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)

ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("eigen on cov(t(data.matrix))")
</code>
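The comment block above relates prcomp()'s sdev to svd()'s d. A quick sanity check of that relationship, assuming the objects from the block above (data.matrix, pca, svd.stuff) are still in the workspace:

<code>
## prcomp()'s sdev should equal svd()'s d divided by sqrt(n - 1),
## where n is the number of samples (the rows of t(data.matrix))
n <- ncol(data.matrix)                            # 10 samples
all.equal(pca$sdev, svd.stuff$d / sqrt(n - 1))    # should be TRUE
</code>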

====== e.g. saq ======
SPSS Anxiety Questionnaire
{{:r:saq8.csv}}

<code>
# saq <- read.csv("http://commres.net/wiki/_media/r/saq.csv", header = T)
saq8 <- read.csv("http://commres.net/wiki/_media/r/saq8.csv", header = T)
head(saq8)
saq8 <- saq8[c(-1)]   # drop the first column so only the eight questionnaire items remain
</code>

<code>
> round(cor(saq8), 3)
              stat_cry afraid_spss sd_excite nmare_pearson du_stat lexp_comp comp_hate good_math
stat_cry         1.000      -0.099    -0.337         0.436   0.402     0.217     0.305     0.331
afraid_spss     -0.099       1.000     0.318        -0.112  -0.119    -0.074    -0.159    -0.050
sd_excite       -0.337       0.318     1.000        -0.380  -0.310    -0.227    -0.382    -0.259
nmare_pearson    0.436      -0.112    -0.380         1.000   0.401     0.278     0.409     0.349
du_stat          0.402      -0.119    -0.310         0.401   1.000     0.257     0.339     0.269
lexp_comp        0.217      -0.074    -0.227         0.278   0.257     1.000     0.514     0.223
comp_hate        0.305      -0.159    -0.382         0.409   0.339     0.514     1.000     0.297
good_math        0.331      -0.050    -0.259         0.349   0.269     0.223     0.297     1.000
</code>

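PCA on standardized variables amounts to an eigen decomposition of this correlation matrix, as in the eigen() example earlier. A minimal sketch, assuming saq8 from the code above:

<code>
## eigen decomposition of the correlation matrix (assumes saq8 from above)
saq8.eigen <- eigen(cor(saq8))
round(saq8.eigen$values, 3)                                  # eigenvalues (variance per component)
round(saq8.eigen$values / sum(saq8.eigen$values) * 100, 1)   # percent of variance per component
</code>
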
<code>
install.packages("Hmisc")
library(Hmisc)
## rcorr() returns the correlations ($r), sample sizes ($n), and p-values ($P)
saq8.rcorr <- rcorr(as.matrix(saq8))

print(saq8.rcorr$r, digits = 3)
</code>
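
The same data can be run through prcomp(), as in the main example above. A minimal sketch, assuming the saq8 data frame loaded above (ID column already dropped):

<code>
## PCA on the eight saq8 items (assumes saq8 from above)
saq8.pca <- prcomp(saq8, scale = TRUE)
summary(saq8.pca)                       # proportion of variance per component

## scree plot of percent variance explained
saq8.var.per <- round(saq8.pca$sdev^2 / sum(saq8.pca$sdev^2) * 100, 1)
barplot(saq8.var.per, main = "Scree Plot",
        xlab = "Principal Component", ylab = "Percent Variation")

## loadings of the items on the first two components
round(saq8.pca$rotation[, 1:2], 3)
</code>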