principal_component_analysis
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revisionNext revisionBoth sides next revision | ||
principal_component_analysis [2019/11/15 03:16] – hkimscil | principal_component_analysis [2019/11/15 22:28] – [e.g. saq] hkimscil | ||
---|---|---|---|
Line 1: | Line 1: | ||
====== PCA ====== | ====== PCA ====== | ||
+ | [[https:// | ||
+ | * Both are data reduction techniques — they allow you to capture the variance in variables in a smaller set. | ||
+ | * Both are usually run in stat software using the same procedure, and the output looks pretty much the same. | ||
+ | * The steps you take to run them are the same — extraction, | ||
+ | * Despite all these similarities, | ||
+ | ====== Some useful lectures ====== | ||
+ | |||
{{youtube> | {{youtube> | ||
- | {{youtube> | + | <WRAP clear /> |
+ | {{youtube> | ||
+ | <WRAP clear /> | ||
+ | {{youtube> | ||
+ | <WRAP clear /> | ||
{{pca_gsp.csv}} | {{pca_gsp.csv}} | ||
+ | Od8gfNOOS9o | ||
+ | |||
+ | < | ||
+ | ## data.matrix | ||
+ | ## columns are individual samples (i.e. cells) | ||
+ | ## rows are measurements taken for all the samples (i.e. genes) | ||
+ | ## Just for the sake of the example, here's some made up data... | ||
+ | data.matrix <- matrix(nrow=100, | ||
+ | colnames(data.matrix) <- c( | ||
+ | paste(" | ||
+ | paste(" | ||
+ | rownames(data.matrix) <- paste(" | ||
+ | for (i in 1:100) { | ||
+ | wt.values <- rpois(5, lambda=sample(x=10: | ||
+ | ko.values <- rpois(5, lambda=sample(x=10: | ||
+ | |||
+ | data.matrix[i, | ||
+ | } | ||
+ | head(data.matrix) | ||
+ | dim(data.matrix) | ||
+ | |||
+ | pca <- prcomp(t(data.matrix), | ||
+ | |||
+ | ## plot pc1 and pc2 | ||
+ | plot(pca$x[, | ||
+ | |||
+ | ## make a scree plot | ||
+ | pca.var <- pca$sdev^2 | ||
+ | pca.var.per <- round(pca.var/ | ||
+ | |||
+ | barplot(pca.var.per, | ||
+ | |||
+ | ## now make a fancy looking plot that shows the PCs and the variation: | ||
+ | library(ggplot2) | ||
+ | |||
+ | pca.data <- data.frame(Sample=rownames(pca$x), | ||
+ | X=pca$x[, | ||
+ | Y=pca$x[, | ||
+ | pca.data | ||
+ | |||
+ | ggplot(data=pca.data, | ||
+ | geom_text() + | ||
+ | xlab(paste(" | ||
+ | ylab(paste(" | ||
+ | theme_bw() + | ||
+ | ggtitle(" | ||
+ | |||
+ | ## get the name of the top 10 measurements (genes) that contribute | ||
+ | ## most to pc1. | ||
+ | loading_scores <- pca$rotation[, | ||
+ | gene_scores <- abs(loading_scores) ## get the magnitudes | ||
+ | gene_score_ranked <- sort(gene_scores, | ||
+ | top_10_genes <- names(gene_score_ranked[1: | ||
+ | |||
+ | top_10_genes ## show the names of the top 10 genes | ||
+ | |||
+ | pca$rotation[top_10_genes, | ||
+ | |||
+ | ####### | ||
+ | ## | ||
+ | ## NOTE: Everything that follows is just bonus stuff. | ||
+ | ## It simply demonstrates how to get the same | ||
+ | ## results using " | ||
+ | ## (Eigen Decomposition). | ||
+ | ## | ||
+ | ####### | ||
+ | |||
+ | ############################################ | ||
+ | ## | ||
+ | ## Now let's do the same thing with svd() | ||
+ | ## | ||
+ | ## svd() returns three things | ||
+ | ## v = the " | ||
+ | ## in other words, a matrix of loading scores | ||
+ | ## u = this is similar to the " | ||
+ | ## | ||
+ | ## You can spread it out by multiplying by " | ||
+ | ## d = this is similar to the " | ||
+ | ## | ||
+ | ## | ||
+ | ## For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/ | ||
+ | ## For svd(), d = sqrt(ss(fit)) | ||
+ | ## | ||
+ | ############################################ | ||
+ | |||
+ | svd.stuff <- svd(scale(t(data.matrix), | ||
+ | |||
+ | ## calculate the PCs | ||
+ | svd.data <- data.frame(Sample=colnames(data.matrix), | ||
+ | X=(svd.stuff$u[, | ||
+ | Y=(svd.stuff$u[, | ||
+ | svd.data | ||
+ | |||
+ | ## alternatively, | ||
+ | ## original data | ||
+ | svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), | ||
+ | svd.pcs[, | ||
+ | |||
+ | svd.df <- ncol(data.matrix) - 1 | ||
+ | svd.var <- svd.stuff$d^2 / svd.df | ||
+ | svd.var.per <- round(svd.var/ | ||
+ | |||
+ | ggplot(data=svd.data, | ||
+ | geom_text() + | ||
+ | xlab(paste(" | ||
+ | ylab(paste(" | ||
+ | theme_bw() + | ||
+ | ggtitle(" | ||
+ | |||
+ | ############################################ | ||
+ | ## | ||
+ | ## Now let's do the same thing with eigen() | ||
+ | ## | ||
+ | ## eigen() returns two things... | ||
+ | ## vectors = eigen vectors (vectors of loading scores) | ||
+ | ## NOTE: pcs = sum(loading scores * values for sample) | ||
+ | ## values = eigen values | ||
+ | ## | ||
+ | ############################################ | ||
+ | cov.mat <- cov(scale(t(data.matrix), | ||
+ | dim(cov.mat) | ||
+ | |||
+ | ## since the covariance matrix is symmetric, we can tell eigen() to just | ||
+ | ## work on the lower triangle with " | ||
+ | eigen.stuff <- eigen(cov.mat, | ||
+ | dim(eigen.stuff$vectors) | ||
+ | head(eigen.stuff$vectors[, | ||
+ | |||
+ | eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), | ||
+ | eigen.pcs[, | ||
+ | |||
+ | eigen.data <- data.frame(Sample=rownames(eigen.pcs), | ||
+ | X=(-1 * eigen.pcs[, | ||
+ | Y=eigen.pcs[, | ||
+ | eigen.data | ||
+ | |||
+ | eigen.var.per <- round(eigen.stuff$values/ | ||
+ | |||
+ | ggplot(data=eigen.data, | ||
+ | geom_text() + | ||
+ | xlab(paste(" | ||
+ | ylab(paste(" | ||
+ | theme_bw() + | ||
+ | ggtitle(" | ||
+ | </ | ||
+ | |||
+ | ====== e.g. saq ====== | ||
+ | SPSS Anxiety Questionnaire | ||
+ | {{: | ||
+ | |||
+ | |||
+ | |||
+ | < | ||
+ | </ | ||
+ |
principal_component_analysis.txt · Last modified: 2019/11/16 15:06 by hkimscil