====== PCA ======
[[https://www.theanalysisfactor.com/the-fundamental-difference-between-principal-component-analysis-and-factor-analysis/|The fundamental difference between principal component analysis and factor analysis]]
  * Both are data reduction techniques: they allow you to capture the variance in variables in a smaller set.
  * Both are usually run in stat software using the same procedure, and the output looks pretty much the same.
  * The steps you take to run them are the same: extraction, interpretation, rotation, and choosing the number of factors or components.
  * Despite all these similarities, there is a fundamental difference between them: PCA is a linear combination of variables, while factor analysis is a measurement model of a latent variable (see the short R sketch after this list).
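
The sketch below is not from the linked article; it is a minimal illustration on R's built-in ''mtcars'' data (an arbitrary choice) contrasting the two procedures: ''prcomp()'' for PCA and ''factanal()'' for maximum-likelihood factor analysis. The dataset and the choice of two factors are assumptions made only for the demonstration.

<code>
## PCA vs. factor analysis on the same data (illustrative sketch)
data(mtcars)

## PCA: each component is a linear combination of the observed variables,
## ordered by how much of the total variance it captures
pca.fit <- prcomp(mtcars, scale. = TRUE)
summary(pca.fit)              ## proportion of variance per component

## Factor analysis: a measurement model in which latent factors account for
## the shared (common) variance among the observed variables
fa.fit <- factanal(mtcars, factors = 2, rotation = "varimax")
print(fa.fit, cutoff = 0.3)   ## rotated loadings
</code>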
====== Some useful lectures ======

{{youtube>
<WRAP clear />
{{youtube>
<WRAP clear />
{{youtube>
<WRAP clear />

{{pca_gsp.csv}}
{{youtube>Od8gfNOOS9o}}
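
A hedged sketch (not part of the original page) of how the attached ''pca_gsp.csv'' might be explored with ''prcomp()''. The file's column layout is an assumption here, so the code simply keeps whatever columns are numeric.

<code>
## load the attached file (assumed to be saved in the working directory)
gsp <- read.csv("pca_gsp.csv")
str(gsp)

## keep numeric columns only, in case the file carries identifier columns
gsp.num <- gsp[sapply(gsp, is.numeric)]

gsp.pca <- prcomp(gsp.num, scale=TRUE)   ## standardize variables before extraction
summary(gsp.pca)                         ## variance explained per component
biplot(gsp.pca)                          ## quick look at scores and loadings
</code>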
| + | |||
| + | < | ||
| + | ## data.matrix | ||
| + | ## columns are individual samples (i.e. cells) | ||
| + | ## rows are measurements taken for all the samples (i.e. genes) | ||
| + | ## Just for the sake of the example, here's some made up data... | ||
| + | data.matrix <- matrix(nrow=100, | ||
| + | colnames(data.matrix) <- c( | ||
| + | paste(" | ||
| + | paste(" | ||
| + | rownames(data.matrix) <- paste(" | ||
| + | for (i in 1:100) { | ||
| + | wt.values <- rpois(5, lambda=sample(x=10: | ||
| + | ko.values <- rpois(5, lambda=sample(x=10: | ||
| + | |||
| + | data.matrix[i, | ||
| + | } | ||
| + | head(data.matrix) | ||
| + | dim(data.matrix) | ||
| + | |||
| + | pca <- prcomp(t(data.matrix), | ||
| + | |||
| + | ## plot pc1 and pc2 | ||
| + | plot(pca$x[, | ||
| + | |||
| + | ## make a scree plot | ||
| + | pca.var <- pca$sdev^2 | ||
| + | pca.var.per <- round(pca.var/ | ||
| + | |||
| + | barplot(pca.var.per, | ||
| + | |||
| + | ## now make a fancy looking plot that shows the PCs and the variation: | ||
| + | library(ggplot2) | ||
| + | |||
| + | pca.data <- data.frame(Sample=rownames(pca$x), | ||
| + | X=pca$x[, | ||
| + | Y=pca$x[, | ||
| + | pca.data | ||
| + | |||
| + | ggplot(data=pca.data, | ||
| + | geom_text() + | ||
| + | xlab(paste(" | ||
| + | ylab(paste(" | ||
| + | theme_bw() + | ||
| + | ggtitle(" | ||
| + | |||
| + | ## get the name of the top 10 measurements (genes) that contribute | ||
| + | ## most to pc1. | ||
| + | loading_scores <- pca$rotation[, | ||
| + | gene_scores <- abs(loading_scores) ## get the magnitudes | ||
| + | gene_score_ranked <- sort(gene_scores, | ||
| + | top_10_genes <- names(gene_score_ranked[1: | ||
| + | |||
| + | top_10_genes ## show the names of the top 10 genes | ||
| + | |||
| + | pca$rotation[top_10_genes, | ||
| + | |||
| + | ####### | ||
| + | ## | ||
| + | ## NOTE: Everything that follow is just bonus stuff. | ||
| + | ## It simply demonstrates how to get the same | ||
| + | ## results using " | ||
| + | ## (Eigen Decomposition). | ||
| + | ## | ||
| + | ####### | ||
| + | |||
| + | ############################################ | ||
| + | ## | ||
| + | ## Now let's do the same thing with svd() | ||
| + | ## | ||
| + | ## svd() returns three things | ||
| + | ## v = the " | ||
| + | ## in other words, a matrix of loading scores | ||
| + | ## u = this is similar to the " | ||
| + | ## | ||
| + | ## You can spread it out by multiplying by " | ||
| + | ## d = this is similar to the " | ||
| + | ## | ||
| + | ## | ||
| + | ## For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/ | ||
| + | ## For svd(), d = sqrt(ss(fit)) | ||
| + | ## | ||
| + | ############################################ | ||
| + | |||
| + | svd.stuff <- svd(scale(t(data.matrix), | ||
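
## Quick numeric check of the relationship noted above (added illustration,
## not part of the original script): with 10 samples, n - 1 = 9, so
## pca$sdev should match svd.stuff$d / sqrt(n - 1) up to numerical precision.
n.samples <- nrow(t(data.matrix))
head(pca$sdev)
head(svd.stuff$d / sqrt(n.samples - 1))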
| + | |||
| + | ## calculate the PCs | ||
| + | svd.data <- data.frame(Sample=colnames(data.matrix), | ||
| + | X=(svd.stuff$u[, | ||
| + | Y=(svd.stuff$u[, | ||
| + | svd.data | ||
| + | |||
| + | ## alternatively, | ||
| + | ## original data | ||
| + | svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), | ||
| + | svd.pcs[, | ||
| + | |||
| + | svd.df <- ncol(data.matrix) - 1 | ||
| + | svd.var <- svd.stuff$d^2 / svd.df | ||
| + | svd.var.per <- round(svd.var/ | ||
| + | |||
| + | ggplot(data=svd.data, | ||
| + | geom_text() + | ||
| + | xlab(paste(" | ||
| + | ylab(paste(" | ||
| + | theme_bw() + | ||
| + | ggtitle(" | ||
| + | |||
| + | ############################################ | ||
| + | ## | ||
| + | ## Now let's do the same thing with eigen() | ||
| + | ## | ||
| + | ## eigen() returns two things... | ||
| + | ## vectors = eigen vectors (vectors of loading scores) | ||
| + | ## NOTE: pcs = sum(loading scores * values for sample) | ||
| + | ## values = eigen values | ||
| + | ## | ||
| + | ############################################ | ||
| + | cov.mat <- cov(scale(t(data.matrix), | ||
| + | dim(cov.mat) | ||
| + | |||
| + | ## since the covariance matrix is symmetric, we can tell eigen() to just | ||
| + | ## work on the lower triangle with " | ||
| + | eigen.stuff <- eigen(cov.mat, | ||
| + | dim(eigen.stuff$vectors) | ||
| + | head(eigen.stuff$vectors[, | ||
| + | |||
| + | eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), | ||
| + | eigen.pcs[, | ||
| + | |||
| + | eigen.data <- data.frame(Sample=rownames(eigen.pcs), | ||
| + | X=(-1 * eigen.pcs[, | ||
| + | Y=eigen.pcs[, | ||
| + | eigen.data | ||
| + | |||
| + | eigen.var.per <- round(eigen.stuff$values/ | ||
| + | |||
| + | ggplot(data=eigen.data, | ||
| + | geom_text() + | ||
| + | xlab(paste(" | ||
| + | ylab(paste(" | ||
| + | theme_bw() + | ||
| + | ggtitle(" | ||
| + | </ | ||
| + | |||
| + | ====== e.g. saq ====== | ||
| + | SPSS Anxiety Questionnaire | ||
| + | {{: | ||
| + | |||
| + | |||
| + | |||
| + | < | ||
| + | # saq <- read.csv(" | ||
| + | saq8 <- read.csv(" | ||
| + | head(saq8) | ||
| + | saq8 <- saq8[c(-1)] | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | > round(cor(saq8), | ||
| + | stat_cry afraid_spss sd_excite nmare_pearson du_stat lexp_comp comp_hate good_math | ||
| + | stat_cry | ||
| + | afraid_spss | ||
| + | sd_excite | ||
| + | nmare_pearson | ||
| + | du_stat | ||
| + | lexp_comp | ||
| + | comp_hate | ||
| + | good_math | ||
| + | > | ||
| + | </ | ||
| + | |||
| + | < | ||
| + | install.packages(" | ||
| + | library(Hmisc) | ||
| + | saq8.rcorr <- rcorr(as.matrix(saq8)) | ||
| + | |||
| + | print(saq8.rcorr$r, | ||
| + | </ | ||
| + | |||