====== PCA ======
[[https://www.theanalysisfactor.com/the-fundamental-difference-between-principal-component-analysis-and-factor-analysis/|Difference between PCA and FA]]
  * Both are data reduction techniques: they allow you to capture the variance in a set of variables with a smaller set of components or factors.
  * Both are usually run in statistical software using the same procedure, and the output looks much the same.
  * The steps you take to run them are the same: extraction, interpretation, rotation, and choosing the number of factors or components.
  * Despite all these similarities, there is a fundamental difference between them: PCA is a linear combination of observed variables, while factor analysis is a measurement model of a latent variable (a minimal sketch of this difference follows below).
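
A minimal R sketch of that difference on made-up data (the simulated variables and the two-factor choice are illustrative assumptions, not from the linked article):

<code>
## PCA vs FA: a principal component is just a weighted sum (linear
## combination) of the observed variables, while factanal() fits a
## model of latent factors assumed to cause the observed variables.
set.seed(1)
df <- as.data.frame(matrix(rnorm(50 * 6), ncol = 6))
df$V2 <- df$V1 + rnorm(50, sd = 0.3)   # make two correlated pairs
df$V4 <- df$V3 + rnorm(50, sd = 0.3)

pca.toy <- prcomp(df, scale = TRUE)
pca.toy$rotation[, 1]      # the weights that define PC1

fa.toy <- factanal(df, factors = 2)
fa.toy$loadings            # loadings of each variable on the latent factors
</code>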
====== Some useful lectures ======

{{youtube>FgakZw6K1QQ}}
<WRAP clear />
{{youtube>g-Hb26agBFg}}
<WRAP clear />
{{youtube>0Jp4gsfOLMs}}
<WRAP clear />

{{pca_gsp.csv}}
{{youtube>Od8gfNOOS9o}}

<code>## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
  paste("wt", 1:5, sep=""),
  paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
  wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
  ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))

  data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
dim(data.matrix)

pca <- prcomp(t(data.matrix), scale=TRUE)

## plot pc1 and pc2
plot(pca$x[,1], pca$x[,2])

## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)

barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")

## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)

pca.data <- data.frame(Sample=rownames(pca$x),
  X=pca$x[,1],
  Y=pca$x[,2])
pca.data

ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("My PCA Graph")

## get the names of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])

top_10_genes ## show the names of the top 10 genes

pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)

#######
##
## NOTE: Everything that follows is just bonus stuff.
## It simply demonstrates how to get the same
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
#######

############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
##     in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
##     sum(the rotation * the original data), but compressed to the unit vector
##     You can spread it out by multiplying by "d"
## d = this is similar to the "sdev" value that prcomp() returns (and thus
##     related to the eigen values), but not
##     scaled by sample size in an unbiased way (i.e. 1/(n-1)).
##     For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
##     For svd(), d = sqrt(ss(fit))
##
############################################

svd.stuff <- svd(scale(t(data.matrix), center=TRUE))
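
## (verification sketch, not in the original code) svd()'s d and
## prcomp()'s sdev should differ only by the sqrt(1/(n-1)) scaling
## described above; here n = ncol(data.matrix) = 10 samples
all.equal(pca$sdev, svd.stuff$d / sqrt(ncol(data.matrix) - 1))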

## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
  X=(svd.stuff$u[,1] * svd.stuff$d[1]),
  Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data

## alternatively, we could compute the PCs with the eigen vectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first two principal components

svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)

ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("svd(scale(t(data.matrix), center=TRUE))")

############################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigen vectors (vectors of loading scores)
##           NOTE: pcs = sum(loading scores * values for sample)
## values = eigen values
##
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)

## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])
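
## (verification sketch, not in the original code) the eigen values of
## the covariance matrix should match the PC variances from prcomp()
## (pca.var, computed above); only the first 9 can be nonzero because
## the 10 samples were centered
all.equal(eigen.stuff$values[1:9], pca.var[1:9])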

eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]

eigen.data <- data.frame(Sample=rownames(eigen.pcs),
  X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
  Y=eigen.pcs[,2]) ## X axis will be PC1, Y axis will be PC2
eigen.data

eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)

ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) +
  geom_text() +
  xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
  ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
  theme_bw() +
  ggtitle("eigen on cov(t(data.matrix))")
</code>

====== e.g. saq ======
SPSS Anxiety Questionnaire
{{:r:saq8.csv}}

<code>
# saq <- read.csv("http://commres.net/wiki/_media/r/saq.csv", header = T)
saq8 <- read.csv("http://commres.net/wiki/_media/r/saq8.csv", header = T)
head(saq8)
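# drop the first column (presumably a row-id column written into the
# csv, not one of the eight questionnaire items)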
saq8 <- saq8[c(-1)]
</code>

<code>
> round(cor(saq8),3)
              stat_cry afraid_spss sd_excite nmare_pearson du_stat lexp_comp comp_hate good_math
stat_cry         1.000      -0.099    -0.337         0.436   0.402     0.217     0.305     0.331
afraid_spss     -0.099       1.000     0.318        -0.112  -0.119    -0.074    -0.159    -0.050
sd_excite       -0.337       0.318     1.000        -0.380  -0.310    -0.227    -0.382    -0.259
nmare_pearson    0.436      -0.112    -0.380         1.000   0.401     0.278     0.409     0.349
du_stat          0.402      -0.119    -0.310         0.401   1.000     0.257     0.339     0.269
lexp_comp        0.217      -0.074    -0.227         0.278   0.257     1.000     0.514     0.223
comp_hate        0.305      -0.159    -0.382         0.409   0.339     0.514     1.000     0.297
good_math        0.331      -0.050    -0.259         0.349   0.269     0.223     0.297     1.000
</code>

<code>
install.packages("Hmisc")
library(Hmisc)
saq8.rcorr <- rcorr(as.matrix(saq8))

print(saq8.rcorr$r, digits = 3)
</code>
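
Given these correlations, a minimal sketch of actually running PCA on the eight items with prcomp() (assuming saq8 has been loaded and cleaned as above):

<code>
## PCA on the eight SAQ items; scale=TRUE standardizes the items first
saq8.pca <- prcomp(saq8, scale = TRUE)
summary(saq8.pca)          # proportion of variance per component

## scree plot, as in the gene example above
saq8.var.per <- round(saq8.pca$sdev^2 / sum(saq8.pca$sdev^2) * 100, 1)
barplot(saq8.var.per, main = "Scree Plot",
        xlab = "Principal Component", ylab = "Percent Variation")

## loadings of each item on the first component
round(saq8.pca$rotation[, 1], 3)
</code>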