====== Lab 06 ======

##########################################################################
# You may cite these labs as follows: McFarland, Daniel, Solomon Messing,
# Mike Nowak, and Sean Westwood. 2010. "Social Network Analysis
# Labs in R." Stanford University.
##########################################################################

##########################################################################
# LAB 6 - Blockmodeling Lab
# The point of this lab is to introduce students to blockmodeling
# techniques that call for a metric of structural equivalence, a method
# and rationale for the selection of the number of positions, and then
# a means of summary representation (mean cutoff and reduced-graph
# presentation). Students will be shown how to identify positions using
# correlation as a metric of structural equivalence (Euclidean distance
# is used in an earlier lab), and they will be taught how to identify more
# isomorphic notions of role-position using the triad census. Last, the
# lab calls upon the user to compare positional techniques and come up
# with a rationale for why they settle on one over another.
##########################################################################

# NOTE: if you have trouble because some packages are not installed,
# see Lab 1 for instructions on how to install all necessary packages.

###
# 1. SETUP
###

library(igraph)
library(sna)
library(triads)
library(psych)
library(nFactors)
library(NetCluster)
# install.packages(c("sna", "NetCluster"))

###
# 2. LOADING AND FORMATTING DATA
###

data(studentnets.M182, package = "NetData")

# Reduce to non-zero edges and build a graph object
m182_full_nonzero_edges <- subset(m182_full_data_frame,
    (friend_tie > 0 | social_tie > 0 | task_tie > 0))
head(m182_full_nonzero_edges)

m182_full <- graph.data.frame(m182_full_nonzero_edges)
summary(m182_full)

# Create sub-graphs based on edge attributes
m182_friend <- delete_edges(m182_full, E(m182_full)[E(m182_full)$friend_tie==0])
summary(m182_friend)
m182_friend[]

m182_social <- delete_edges(m182_full, E(m182_full)[E(m182_full)$social_tie==0])
summary(m182_social)
m182_social[]

m182_task <- delete_edges(m182_full, E(m182_full)[E(m182_full)$task_tie==0])
summary(m182_task)
m182_task[]

# Look at the plots for each sub-graph
pdf("6.1_m182_studentnet_friend_social_task_plots.pdf", width = 10)
par(mfrow = c(1,3))

friend_layout <- layout.fruchterman.reingold(m182_friend)
plot(m182_friend, layout=friend_layout, main = "friend", edge.arrow.size=.5)

social_layout <- layout.fruchterman.reingold(m182_social)
plot(m182_social, layout=social_layout, main = "social", edge.arrow.size=.5)

task_layout <- layout.fruchterman.reingold(m182_task)
plot(m182_task, layout=task_layout, main = "task", edge.arrow.size=.5)

dev.off()

###
# 3. HIERARCHICAL CLUSTERING ON SOCIAL & TASK TIES
###

# We'll use the "task" and "social" sub-graphs together as the
# basis for our structural equivalence methods. First, we'll use
# the task graph to generate an adjacency matrix.
#
# This matrix represents task interactions directed FROM the
# row individual TO the column individual.
m182_task_matrix_row_to_col <- get.adjacency(m182_task, attr='task_tie')
m182_task_matrix_row_to_col

# To operate on a binary graph, simply leave off the "attr"
# parameter:
m182_task_matrix_row_to_col_bin <- get.adjacency(m182_task)
m182_task_matrix_row_to_col_bin

# For this lab, we'll use the valued graph.
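
### OPTIONAL SKETCH: a quick sanity check, assuming the two matrices
# created just above, that the binary matrix is simply the valued
# matrix dichotomized at zero. The *_check names are new and are
# used only for this aside.
task_valued_check <- as.matrix(m182_task_matrix_row_to_col)
task_binary_check <- as.matrix(m182_task_matrix_row_to_col_bin)
all((task_valued_check > 0) == (task_binary_check > 0))  # should return TRUE
### END OPTIONAL SKETCH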
# The next step is to concatenate the valued matrix with its
# transpose in order to capture both incoming and outgoing task
# interactions.
m182_task_matrix_col_to_row <- t(as.matrix(m182_task_matrix_row_to_col))
m182_task_matrix_col_to_row

m182_task_matrix <- rbind(m182_task_matrix_row_to_col, m182_task_matrix_col_to_row)
m182_task_matrix

# Next, we'll use the same procedure to add social-interaction
# information.
m182_social_matrix_row_to_col <- get.adjacency(m182_social, attr='social_tie')
m182_social_matrix_row_to_col

m182_social_matrix_row_to_col_bin <- get.adjacency(m182_social)
m182_social_matrix_row_to_col_bin

m182_social_matrix_col_to_row <- t(as.matrix(m182_social_matrix_row_to_col))
m182_social_matrix_col_to_row

m182_social_matrix <- rbind(m182_social_matrix_row_to_col, m182_social_matrix_col_to_row)
m182_social_matrix

m182_task_social_matrix <- rbind(m182_task_matrix, m182_social_matrix)
m182_task_social_matrix

# Now we have a single 4n x n matrix that represents both in- and
# out-directed task and social communication. From this, we can
# generate an n x n correlation matrix that shows the degree of
# structural equivalence of each actor in the network.
m182_task_social_cors <- cor(as.matrix(m182_task_social_matrix))
m182_task_social_cors

# To use correlation values in hierarchical clustering, they must
# first be coerced into a "dissimilarity structure" using dist().
# We subtract the values from 1 so that they are all greater than
# or equal to 0; thus, highly dissimilar (i.e., negatively
# correlated) actors have higher values.
dissimilarity <- 1 - m182_task_social_cors
m182_task_social_dist <- as.dist(dissimilarity)
m182_task_social_dist

# Note that it is also possible to use dist() directly on the
# matrix. However, since cor() looks at associations between
# columns and dist() looks at associations between rows, it is
# necessary to transpose the matrix first.
#
# A variety of distance metrics are available; Euclidean
# is the default.
#m182_task_social_dist <- dist(t(m182_task_social_matrix))
#m182_task_social_dist

# hclust() performs a hierarchical agglomerative clustering
# operation based on the values in the dissimilarity matrix
# yielded by as.dist() above. The standard visualization is a
# dendrogram. By default, hclust() agglomerates clusters via a
# "complete linkage" algorithm, determining cluster proximity
# by looking at the distance of the two points across clusters
# that are farthest away from one another. This can be changed via
# the "method" parameter.
pdf("6.2_m182_studentnet_social_hclust.pdf")
m182_task_social_hclust <- hclust(m182_task_social_dist)
plot(m182_task_social_hclust)
dev.off()

# cutree() allows us to use the output of hclust() to set
# different numbers of clusters and assign vertices to clusters
# as appropriate. For example:
cutree(m182_task_social_hclust, k=2)

# Now we'll try to figure out the number of clusters that best
# describes the underlying data. To do this, we'll loop through
# all of the possible numbers of clusters (1 through n, where n is
# the number of actors in the network). For each solution
# corresponding to a given number of clusters, we'll use cutree()
# to assign the vertices to their respective clusters
# corresponding to that solution.
#
# From this, we can generate a matrix of within- and between-
# cluster correlations.
# Thus, when there is one cluster for each
# vertex in the network, the cell values will be identical to the
# observed correlation matrix, and when there is one cluster for
# the whole network, the values will all be equal to the average
# correlation across the observed matrix.
#
# We can then correlate each by-cluster matrix with the observed
# correlation matrix to see how well the by-cluster matrix fits
# the data. We'll store the correlation for each number of
# clusters in a vector, which we can then plot.

# First, we initialize a vector for storing the correlations and
# set a variable for our number of vertices.
clustered_observed_cors = vector()
num_vertices = length(V(m182_task))

# Next, we loop through the different possible cluster
# configurations, produce matrices of within- and between-
# cluster correlations, and correlate these by-cluster matrices
# with the observed correlation matrix.
pdf("6.3_m182_studentnet_task_social_clustered_observed_corrs.pdf")
clustered_observed_cors <- clustConfigurations(num_vertices, m182_task_social_hclust, m182_task_social_cors)
clustered_observed_cors
plot(clustered_observed_cors$correlations)
dev.off()

clustered_observed_cors$correlations

# From a visual inspection of the correlation matrix, we can
# decide on the proper number of clusters in this network.
# For this network, we'll use 4. (Note that the 1-cluster
# solution doesn't appear on the plot because its correlation
# with the observed correlation matrix is undefined.)
num_clusters = 4
clusters <- cutree(m182_task_social_hclust, k = num_clusters)
clusters

cluster_cor_mat <- clusterCorr(m182_task_social_cors, clusters)
cluster_cor_mat

# Let's look at the correlation between this cluster configuration
# and the observed correlation matrix. This should match the
# corresponding value from clustered_observed_cors above.
gcor(cluster_cor_mat, m182_task_social_cors)

#####################
# Questions:
# (1) What rationale do you have for selecting the number of
# clusters / positions that you do?
#####################

### NOTE ON DEDUCTIVE CLUSTERING
# It's pretty straightforward, using the code above, to explore
# your own deductive clustering. Simply supply your own cluster
# vector, where the elements in the vector are in the same order
# as the vertices in the matrix, and the values represent the
# cluster to which each vertex belongs.
#
# For example, if you believed that actors 2, 7, and 8 formed one
# group, actor 16 formed another group, and everyone else formed
# a third group, you could represent this as follows:
deductive_clusters = c(1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3)

# You could then examine the fitness of this cluster configuration
# as follows (if generate_cluster_cor_mat() is not available in
# your version of NetCluster, clusterCorr(), used above, serves
# the same purpose):
deductive_cluster_cor_mat <- generate_cluster_cor_mat(m182_task_social_cors,
    deductive_clusters)
deductive_cluster_cor_mat
gcor(deductive_cluster_cor_mat, m182_task_social_cors)
### END NOTE ON DEDUCTIVE CLUSTERING

# Now we'll use the 4-cluster solution to generate blockmodels,
# using the raw tie data from the underlying task and social
# networks.
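
### OPTIONAL SKETCH: what a cell in the blockmodels below means.
# With the default settings of sna's blockmodel(), each
# off-diagonal cell is just the mean of the corresponding
# submatrix of raw ties. For example, the mean task-tie value sent
# from position 1 to position 2 can be computed by hand (this
# assumes the `clusters` vector from above; task_mat_sketch is a
# new name used only here):
task_mat_sketch <- as.matrix(m182_task_matrix_row_to_col)
mean(task_mat_sketch[clusters == 1, clusters == 2])
# (Diagonal blocks differ slightly because self-ties are excluded.)
### END OPTIONAL SKETCH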
# Task valued
task_mean <- mean(as.matrix(m182_task_matrix_row_to_col))
task_mean

task_valued_blockmodel <- blockmodel(as.matrix(m182_task_matrix_row_to_col), clusters)
task_valued_blockmodel

# Task binary
task_density <- graph.density(m182_task)
task_density

task_binary_blockmodel <- blockmodel(as.matrix(m182_task_matrix_row_to_col_bin), clusters)
task_binary_blockmodel

# Social valued
social_mean <- mean(as.matrix(m182_social_matrix_row_to_col))
social_mean

social_valued_blockmodel <- blockmodel(as.matrix(m182_social_matrix_row_to_col), clusters)
social_valued_blockmodel

# Social binary
social_density <- graph.density(m182_social)
social_density

social_binary_blockmodel <- blockmodel(as.matrix(m182_social_matrix_row_to_col_bin), clusters)
social_binary_blockmodel

# We can also permute the network to examine the within- and
# between-cluster correlations.
cluster_cor_mat_per <- permute_matrix(clusters, cluster_cor_mat)
cluster_cor_mat_per

#####################
# Questions:
# (2) What is the story you get from viewing these clusters,
# and their within and between cluster densities on task and
# social interaction? What can you say about M182 from this?
#####################

###
# 4. HIERARCHICAL CLUSTERING ON TRIAD CENSUS
###

# NOTE: the triads package is outdated and is not available for
# current versions of R. - hkim

# Another way to think about roles within a network is by looking
# at the triads that each actor belongs to. We can then use
# correlations between triad-type memberships to identify people
# with similar roles regardless of the specific people with whom
# they interact.

# First, we'll generate an individual-level triad census of the
# network using triadcensus() from the triads package.
task_triads <- triadcensus(m182_task)
task_triads

# Next, we'll generate a matrix of correlations between actors
# in the network based on their similarity in triad-type
# membership. Note that the cor() function in R operates on
# columns, not rows, so in order to get correlations between
# the actors in the network we have to transpose it.
m182_task_triad_cors <- cor(t(task_triads))
m182_task_triad_cors

# As above, we can use the correlation matrix to generate a
# dissimilarity structure, which we can then hierarchically
# cluster into groups of similar people.
dissimilarity <- 1 - m182_task_triad_cors
m182_task_triad_dist <- as.dist(dissimilarity)
m182_task_triad_dist

m182_task_triad_hclust <- hclust(m182_task_triad_dist)
pdf("6.4_m182_studentnet_task_triad_hclust.pdf")
plot(m182_task_triad_hclust)
dev.off()

# As above, we'll loop through each possible cluster solution
# and see how well they match the observed matrix of triad-type
# correlations.
clustered_observed_cors = vector()
num_vertices = length(V(m182_task))

pdf("6.5_m182_studentnet_task_hclust_triad_corrs.pdf")
clustered_observed_cors <- clustConfigurations(num_vertices, m182_task_triad_hclust, m182_task_triad_cors)
dev.off()

clustered_observed_cors

# From a visual inspection of the data, we'll use a 3-cluster
# solution (though a case could also be made for using 5).
num_clusters = 3
clusters <- cutree(m182_task_triad_hclust, k = num_clusters)
clusters

cluster_cor_mat <- clusterCorr(m182_task_triad_cors, clusters)
cluster_cor_mat

gcor(cluster_cor_mat, m182_task_triad_cors)

# As before, we can use these clusters to run a blockmodel
# analysis using the underlying tie data from the task network
# (an optional comparison of the two cluster solutions is
# sketched first).
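
### OPTIONAL SKETCH: one quick way to see how the triad-based
# positions line up with the correlation-based positions from
# Section 3 is to cross-tabulate the two assignments. They are
# recomputed here under new names so that neither overwrites the
# `clusters` vector used below.
cor_clusters_sketch   <- cutree(m182_task_social_hclust, k = 4)
triad_clusters_sketch <- cutree(m182_task_triad_hclust, k = 3)
table(cor_clusters_sketch, triad_clusters_sketch)
### END OPTIONAL SKETCH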
# Task valued
task_mean <- mean(as.matrix(m182_task_matrix_row_to_col))
task_mean

task_valued_blockmodel <- blockmodel(as.matrix(m182_task_matrix_row_to_col), clusters)
task_valued_blockmodel

# Task binary
task_density <- graph.density(m182_task)
task_density

task_binary_blockmodel <- blockmodel(as.matrix(m182_task_matrix_row_to_col_bin), clusters)
task_binary_blockmodel

# Finally, we can try to get a sense of what our different
# clusters represent by generating a cluster-by-triad-type matrix.
# This is an m x n matrix, where m is the number of clusters and
# n is the number of possible triad types (36). Each cell is the
# average number of the given triad type for each individual in
# the cluster.
cluster_triad_mat <- matrix(nrow=max(clusters), ncol=ncol(task_triads))
for (i in 1:max(clusters)) {
    for (j in 1:ncol(task_triads)) {
        cluster_triad_mat[i,j] <- mean(task_triads[which(clusters==i),j])
    }
}
cluster_triad_mat

#####################
# Questions:
# (3) What does clustering of the triadic census afford us?
# What roles do you see? Redo the initial blockmodel analysis
# without social interaction (only task) and then compare to
# this solution. Do they differ?
#
# Extra credit: Try running the triad census on task AND
# social interaction separately and then correlating persons.
# What result do you get? Is it different from our initial
# blockmodel result? Show your code.
######################

###
# 5. FACTOR ANALYSIS
###

# Note that although we are conducting a principal components
# analysis (PCA), which is technically not exactly the same as
# factor analysis, we will use the term "factor" to describe the
# individual components in our PCA.

# PCA is often used in network analysis as a form of detecting
# individuals' global positioning. We say "global" because these
# clusters aren't defined by local cohesion but by the overall
# pattern of ties individuals have with all others (structural
# equivalence). Identifying the first two largest components that
# organize the variance in tie patterns is one way of doing this.

# We'll analyze the 4n x n matrix generated above.

# First, we want to determine the ideal number of components
# (factors) to extract. We'll do this by examining the eigenvalues
# in a scree plot and examining how each number of factors stacks
# up to a few proposed non-graphical solutions to selecting the
# optimal number of components, available via the nFactors
# package.
ev <- eigen(cor(as.matrix(m182_task_social_matrix)))  # get eigenvalues
ap <- parallel(subject=nrow(m182_task_social_matrix),
    var=ncol(m182_task_social_matrix), rep=100, cent=.05)
nS <- nScree(ev$values, ap$eigen$qevpea)

pdf("6.6_m182_studentnet_task_social_pca_scree.pdf")
plotnScree(nS)

# To draw a line across the graph where eigenvalues are = 1,
# use the following code:
plotnScree(nS)
abline(h=1)
dev.off()

# For more information on this procedure, please see
# the references provided in the parallel() documentation
# (type "?parallel" in the R command line with the package
# loaded).

# Now we'll run a principal components analysis on the matrix,
# using the number of factors determined above (note this may not
# be the same number as you get):
pca_m182_task_social = principal(as.matrix(m182_task_social_matrix),
    nfactors=5, rotate="varimax")

# Let's take a look at the results in the R terminal:
pca_m182_task_social

# You can see the standardized loadings for each factor for each
# node. Note that R sometimes puts the factors in a funky order
# (e.g. RC1, RC2, RC5, RC4, RC3) but all of the factors are there.
# You can see that the SS loadings, proportion of variance
# explained, and cumulative variance explained are provided below.
# A chi-square test of the factors and various other statistics
# are provided as well.

# Note that the eigenvalues can be accessed via the following
# command:
pca_m182_task_social$values

# Now we will use the factor loadings to cluster and compare that
# to our other clustering techniques, using dendrograms.

# Take the distance based on Euclidean distance
m182_task_factor_dist = dist(pca_m182_task_social$loadings)

# And cluster
m182_task_factor_hclust <- hclust(m182_task_factor_dist)
pdf("6.7_m182_studentnet_task_social_pca_hclust.pdf")
plot(m182_task_factor_hclust)
dev.off()

# And compare to the clustering based on correlations and triads:
pdf("6.8_m182_task_cluster_by_correlation_PCA_Triads.pdf")
par(mfrow = c(1,3))
plot(m182_task_social_hclust, main = "Correlation")
plot(m182_task_factor_hclust, main = "PCA")
plot(m182_task_triad_hclust, main = "Triads")
dev.off()

#####################
# Questions:
# (4) How do the results across blockmodel techniques differ?
# Why might you use one over the other? Why might you want to
# run more than one in your analyses?
#####################
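
### OPTIONAL SKETCH: the lab header mentions a "mean cutoff and
# reduced-graph presentation", which the code above stops short of.
# A minimal sketch of that last step, assuming the most recent
# task_valued_blockmodel and task_mean objects from above: keep
# only the blocks whose density exceeds the overall mean tie value,
# and plot the resulting image matrix as a small directed graph
# among positions. (The $block.model element of an sna blockmodel
# object holds the position-by-position densities.)
block_densities <- task_valued_blockmodel$block.model
reduced_image <- (block_densities > task_mean) * 1
reduced_image[is.na(reduced_image)] <- 0  # singleton positions can yield NA on the diagonal
reduced_graph <- graph.adjacency(reduced_image, mode = "directed")
plot(reduced_graph, main = "Reduced graph (task, mean cutoff)")
### END OPTIONAL SKETCH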
====== no pdf ======

##########################################################################
# You may cite these labs as follows: McFarland, Daniel, Solomon Messing,
# Mike Nowak, and Sean Westwood. 2010. "Social Network Analysis
# Labs in R." Stanford University.
##########################################################################

##########################################################################
# LAB 6 - Blockmodeling Lab
# The point of this lab is to introduce students to blockmodeling
# techniques that call for a metric of structural equivalence, a method
# and rationale for the selection of the number of positions, and then
# a means of summary representation (mean cutoff and reduced-graph
# presentation). Students will be shown how to identify positions using
# correlation as a metric of structural equivalence (Euclidean distance
# is used in an earlier lab), and they will be taught how to identify more
# isomorphic notions of role-position using the triad census. Last, the
# lab calls upon the user to compare positional techniques and come up
# with a rationale for why they settle on one over another.
##########################################################################

# NOTE: if you have trouble because some packages are not installed,
# see Lab 1 for instructions on how to install all necessary packages.

###
# 1. SETUP
###

library(igraph)
library(sna)
library(triads)
library(psych)
library(nFactors)
library(NetCluster)

###
# 2. LOADING AND FORMATTING DATA
###

data(studentnets.M182, package = "NetData")

# Reduce to non-zero edges and build a graph object
m182_full_nonzero_edges <- subset(m182_full_data_frame,
    (friend_tie > 0 | social_tie > 0 | task_tie > 0))
head(m182_full_nonzero_edges)

m182_full <- graph.data.frame(m182_full_nonzero_edges)
summary(m182_full)

# Create sub-graphs based on edge attributes
m182_friend <- delete_edges(m182_full, E(m182_full)[E(m182_full)$friend_tie==0])
summary(m182_friend)

m182_social <- delete_edges(m182_full, E(m182_full)[E(m182_full)$social_tie==0])
summary(m182_social)

m182_task <- delete_edges(m182_full, E(m182_full)[E(m182_full)$task_tie==0])
summary(m182_task)

# Look at the plots for each sub-graph
# pdf("6.1_m182_studentnet_friend_social_task_plots.pdf", width = 10)
par(mfrow = c(1,3))

friend_layout <- layout.fruchterman.reingold(m182_friend)
plot(m182_friend, layout=friend_layout, main = "friend", edge.arrow.size=.5)

social_layout <- layout.fruchterman.reingold(m182_social)
plot(m182_social, layout=social_layout, main = "social", edge.arrow.size=.5)

task_layout <- layout.fruchterman.reingold(m182_task)
plot(m182_task, layout=task_layout, main = "task", edge.arrow.size=.5)

# dev.off()

# back to a normal plotting window (no panel partition)
par(mfrow = c(1,1))

###
# 3. HIERARCHICAL CLUSTERING ON SOCIAL & TASK TIES
###

# We'll use the "task" and "social" sub-graphs together as the
# basis for our structural equivalence methods. First, we'll use
# the task graph to generate an adjacency matrix.
#
# This matrix represents task interactions directed FROM the
# row individual TO the column individual.
m182_task_matrix_row_to_col <- get.adjacency(m182_task, attr='task_tie')
m182_task_matrix_row_to_col

# To operate on a binary graph, simply leave off the "attr"
# parameter:
m182_task_matrix_row_to_col_bin <- get.adjacency(m182_task)
m182_task_matrix_row_to_col_bin

# For this lab, we'll use the valued graph. The next step is to
# concatenate it with its transpose in order to capture both
# incoming and outgoing task interactions.
m182_task_matrix_col_to_row <- t(as.matrix(m182_task_matrix_row_to_col))
m182_task_matrix_col_to_row

m182_task_matrix <- rbind(m182_task_matrix_row_to_col, m182_task_matrix_col_to_row)
m182_task_matrix

# Next, we'll use the same procedure to add social-interaction
# information.
m182_social_matrix_row_to_col <- get.adjacency(m182_social, attr='social_tie')
m182_social_matrix_row_to_col

m182_social_matrix_row_to_col_bin <- get.adjacency(m182_social)
m182_social_matrix_row_to_col_bin

m182_social_matrix_col_to_row <- t(as.matrix(m182_social_matrix_row_to_col))
m182_social_matrix_col_to_row

m182_social_matrix <- rbind(m182_social_matrix_row_to_col, m182_social_matrix_col_to_row)
m182_social_matrix

m182_task_social_matrix <- rbind(m182_task_matrix, m182_social_matrix)
m182_task_social_matrix

# Now we have a single 4n x n matrix that represents both in- and
# out-directed task and social communication. From this, we can
# generate an n x n correlation matrix that shows the degree of
# structural equivalence of each actor in the network.
m182_task_social_cors <- cor(as.matrix(m182_task_social_matrix))
m182_task_social_cors

# To use correlation values in hierarchical clustering, they must
# first be coerced into a "dissimilarity structure" using dist().
# We subtract the values from 1 so that they are all greater than
# or equal to 0; thus, highly dissimilar (i.e., negatively
# correlated) actors have higher values.
dissimilarity <- 1 - m182_task_social_cors
m182_task_social_dist <- as.dist(dissimilarity)
m182_task_social_dist

# Note that it is also possible to use dist() directly on the
# matrix. However, since cor() looks at associations between
# columns and dist() looks at associations between rows, it is
# necessary to transpose the matrix first.
#
# A variety of distance metrics are available; Euclidean
# is the default.
#m182_task_social_dist <- dist(t(m182_task_social_matrix))
#m182_task_social_dist

# hclust() performs a hierarchical agglomerative clustering
# operation based on the values in the dissimilarity matrix
# yielded by as.dist() above. The standard visualization is a
# dendrogram. By default, hclust() agglomerates clusters via a
# "complete linkage" algorithm, determining cluster proximity
# by looking at the distance of the two points across clusters
# that are farthest away from one another. This can be changed via
# the "method" parameter (see the optional sketch below).
# pdf("6.2_m182_studentnet_social_hclust.pdf")
m182_task_social_hclust <- hclust(m182_task_social_dist)
plot(m182_task_social_hclust)
# dev.off()

# cutree() allows us to use the output of hclust() to set
# different numbers of clusters and assign vertices to clusters
# as appropriate. For example:
cutree(m182_task_social_hclust, k=2)

# Now we'll try to figure out the number of clusters that best
# describes the underlying data. To do this, we'll loop through
# all of the possible numbers of clusters (1 through n, where n is
# the number of actors in the network). For each solution
# corresponding to a given number of clusters, we'll use cutree()
# to assign the vertices to their respective clusters
# corresponding to that solution.
#
# From this, we can generate a matrix of within- and between-
# cluster correlations. Thus, when there is one cluster for each
# vertex in the network, the cell values will be identical to the
# observed correlation matrix, and when there is one cluster for
# the whole network, the values will all be equal to the average
# correlation across the observed matrix.
#
# We can then correlate each by-cluster matrix with the observed
# correlation matrix to see how well the by-cluster matrix fits
# the data. We'll store the correlation for each number of
# clusters in a vector, which we can then plot.
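
### OPTIONAL SKETCH (referenced above): the dendrogram was built
# with the default complete linkage. As a robustness check, one
# can rerun the clustering with a different linkage and see
# whether the groupings change; average linkage is shown here, and
# the new object name is used only for this comparison.
m182_task_social_hclust_avg <- hclust(m182_task_social_dist, method = "average")
plot(m182_task_social_hclust_avg, main = "Average linkage")
### END OPTIONAL SKETCH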
# First, we initialize a vector for storing the correlations and
# set a variable for our number of vertices.
clustered_observed_cors = vector()
num_vertices = length(V(m182_task))

# Next, we loop through the different possible cluster
# configurations, produce matrices of within- and between-
# cluster correlations, and correlate these by-cluster matrices
# with the observed correlation matrix.
# pdf("6.3_m182_studentnet_task_social_clustered_observed_corrs.pdf")
clustered_observed_cors <- clustConfigurations(num_vertices, m182_task_social_hclust, m182_task_social_cors)
clustered_observed_cors
plot(clustered_observed_cors$correlations)
# dev.off()

clustered_observed_cors$correlations

# From a visual inspection of the correlation matrix, we can
# decide on the proper number of clusters in this network.
# For this network, we'll use 4. (Note that the 1-cluster
# solution doesn't appear on the plot because its correlation
# with the observed correlation matrix is undefined.)
num_clusters = 4
clusters <- cutree(m182_task_social_hclust, k = num_clusters)
clusters

cluster_cor_mat <- clusterCorr(m182_task_social_cors, clusters)
cluster_cor_mat

# Let's look at the correlation between this cluster configuration
# and the observed correlation matrix. This should match the
# corresponding value from clustered_observed_cors above.
gcor(cluster_cor_mat, m182_task_social_cors)

#####################
# Questions:
# (1) What rationale do you have for selecting the number of
# clusters / positions that you do?
#####################

### NOTE ON DEDUCTIVE CLUSTERING
# It's pretty straightforward, using the code above, to explore
# your own deductive clustering. Simply supply your own cluster
# vector, where the elements in the vector are in the same order
# as the vertices in the matrix, and the values represent the
# cluster to which each vertex belongs.
#
# For example, if you believed that actors 2, 7, and 8 formed one
# group, actor 16 formed another group, and everyone else formed
# a third group, you could represent this as follows:
deductive_clusters = c(1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 3)

# You could then examine the fitness of this cluster configuration
# as follows (if generate_cluster_cor_mat() is not available in
# your version of NetCluster, clusterCorr(), used above, serves
# the same purpose):
deductive_cluster_cor_mat <- generate_cluster_cor_mat(m182_task_social_cors,
    deductive_clusters)
deductive_cluster_cor_mat
gcor(deductive_cluster_cor_mat, m182_task_social_cors)
### END NOTE ON DEDUCTIVE CLUSTERING

# Now we'll use the 4-cluster solution to generate blockmodels,
# using the raw tie data from the underlying task and social
# networks.

# Task valued
task_mean <- mean(as.matrix(m182_task_matrix_row_to_col))
task_mean

task_valued_blockmodel <- blockmodel(as.matrix(m182_task_matrix_row_to_col), clusters)
task_valued_blockmodel

# Task binary
task_density <- graph.density(m182_task)
task_density

task_binary_blockmodel <- blockmodel(as.matrix(m182_task_matrix_row_to_col_bin), clusters)
task_binary_blockmodel

# Social valued
social_mean <- mean(as.matrix(m182_social_matrix_row_to_col))
social_mean

social_valued_blockmodel <- blockmodel(as.matrix(m182_social_matrix_row_to_col), clusters)
social_valued_blockmodel

# Social binary
social_density <- graph.density(m182_social)
social_density

social_binary_blockmodel <- blockmodel(as.matrix(m182_social_matrix_row_to_col_bin), clusters)
social_binary_blockmodel

# We can also permute the network to examine the within- and
# between-cluster correlations (a by-hand version of the same
# idea is sketched below).
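
### OPTIONAL SKETCH (the "by hand" version mentioned above):
# simply reordering the observed correlation matrix so that
# members of the same cluster sit next to each other already makes
# the block structure visible. This assumes the `clusters` vector
# and the observed correlation matrix from above.
round(m182_task_social_cors[order(clusters), order(clusters)], 2)
### END OPTIONAL SKETCH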
cluster_cor_mat_per <- permute_matrix(clusters, cluster_cor_mat)
cluster_cor_mat_per

#####################
# Questions:
# (2) What is the story you get from viewing these clusters,
# and their within and between cluster densities on task and
# social interaction? What can you say about M182 from this?
#####################

#
# 4. deleted.
#

#####################
# Questions:
# (3) What does clustering of the triadic census afford us?
# What roles do you see? Redo the initial blockmodel analysis
# without social interaction (only task) and then compare to
# this solution. Do they differ?
#
# Extra credit: Try running the triad census on task AND
# social interaction separately and then correlating persons.
# What result do you get? Is it different from our initial
# blockmodel result? Show your code.
######################

###
# 5. FACTOR ANALYSIS
###

# Note that although we are conducting a principal components
# analysis (PCA), which is technically not exactly the same as
# factor analysis, we will use the term "factor" to describe the
# individual components in our PCA.

# PCA is often used in network analysis as a form of detecting
# individuals' global positioning. We say "global" because these
# clusters aren't defined by local cohesion but by the overall
# pattern of ties individuals have with all others (structural
# equivalence). Identifying the first two largest components that
# organize the variance in tie patterns is one way of doing this.

# We'll analyze the 4n x n matrix generated above.

# First, we want to determine the ideal number of components
# (factors) to extract. We'll do this by examining the eigenvalues
# in a scree plot and examining how each number of factors stacks
# up to a few proposed non-graphical solutions to selecting the
# optimal number of components, available via the nFactors
# package.
ev <- eigen(cor(as.matrix(m182_task_social_matrix)))  # get eigenvalues
ap <- parallel(subject=nrow(m182_task_social_matrix),
    var=ncol(m182_task_social_matrix), rep=100, cent=.05)
nS <- nScree(ev$values, ap$eigen$qevpea)

# pdf("6.6_m182_studentnet_task_social_pca_scree.pdf")
plotnScree(nS)

# To draw a line across the graph where eigenvalues are = 1,
# use the following code:
plotnScree(nS)
abline(h=1)
# dev.off()

# For more information on this procedure, please see
# the references provided in the parallel() documentation
# (type "?parallel" in the R command line with the package
# loaded).

# Now we'll run a principal components analysis on the matrix,
# using the number of factors determined above (note this may not
# be the same number as you get):
pca_m182_task_social = principal(as.matrix(m182_task_social_matrix),
    nfactors=5, rotate="varimax")

# Let's take a look at the results in the R terminal:
pca_m182_task_social

# You can see the standardized loadings for each factor for each
# node. Note that R sometimes puts the factors in a funky order
# (e.g. RC1, RC2, RC5, RC4, RC3) but all of the factors are there.
# You can see that the SS loadings, proportion of variance
# explained, and cumulative variance explained are provided below.
# A chi-square test of the factors and various other statistics
# are provided as well.

# Note that the eigenvalues can be accessed via the following
# command:
pca_m182_task_social$values

# Now we will use the factor loadings to cluster and compare that
# to our other clustering techniques, using dendrograms.
# Take the distance based on Euclidean distance
m182_task_factor_dist = dist(pca_m182_task_social$loadings)

# And cluster
m182_task_factor_hclust <- hclust(m182_task_factor_dist)
# pdf("6.7_m182_studentnet_task_social_pca_hclust.pdf")
plot(m182_task_factor_hclust)
# dev.off()

# And compare to the clustering based on correlations and triads:
# pdf("6.8_m182_task_cluster_by_correlation_PCA_Triads.pdf")
par(mfrow = c(1,2))
plot(m182_task_social_hclust, main = "Correlation")
plot(m182_task_factor_hclust, main = "PCA")
# The triad dendrogram below did not work in this version because
# Section 4 (the triad census) was skipped:
# plot(m182_task_triad_hclust, main = "Triads")
# dev.off()

#####################
# Questions:
# (4) How do the results across blockmodel techniques differ?
# Why might you use one over the other? Why might you want to
# run more than one in your analyses?
#####################
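
### OPTIONAL SKETCH: question (4) asks how the techniques differ.
# One concrete way to compare them is to cut the PCA-based and
# correlation-based dendrograms at the same number of positions
# and cross-tabulate the assignments (4 positions is used here to
# match Section 3; the object names are new and used only for this
# aside).
cor_positions_sketch <- cutree(m182_task_social_hclust, k = 4)
pca_positions_sketch <- cutree(m182_task_factor_hclust, k = 4)
table(cor_positions_sketch, pca_positions_sketch)
### END OPTIONAL SKETCH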