# Lab 04

#####################
# LAB 4: Centrality #
#####################


# NOTE: if you have trouble because some packages are not installed, 
# see lab 1 for instructions on how to install all necessary packages.


#############################################################
# 
# Lab 4 
#
# The purpose of this lab is to acquire centrality measures, 
# to determine how they are interrelated, and to discern 
# what they mean. 
#
##############################################################
 

###
# 1. SETUP 
###
library(igraph)


###
# 2. LOAD DATA
###

# This lab uses SSL.dat (social interaction) and TSL.dat (task 
# interaction) from the S641 Semester 1 class in student_nets.
# The class is a biology 2 class at a public high school.

# Pull the S641 data set out of the NetData package. The code below
# relies on it providing the edge list s641_full_data_frame.
data(studentnets.S641, package = "NetData")

# Keep only the rows that carry at least one kind of tie (social or
# task), then peek at the first few of them.
s641_full_nonzero_edges <- s641_full_data_frame[
  which(
    s641_full_data_frame$social_tie > 0 | s641_full_data_frame$task_tie > 0
  ),
]
head(s641_full_nonzero_edges)

# Turn the reduced edge list into a graph object.
s641_full <- graph.data.frame(s641_full_nonzero_edges)
summary(s641_full)

# Social sub-graph: drop every edge whose social_tie is zero, then
# drop the vertices that are left without any edge (isolates).
s641_social <- delete.edges(
  s641_full,
  E(s641_full)[E(s641_full)$social_tie == 0]
)
s641_social <- delete.vertices(
  s641_social,
  V(s641_social)[degree(s641_social) == 0]
)
summary(s641_social)

# Task sub-graph: same recipe, keyed on task_tie.
s641_task <- delete.edges(
  s641_full,
  E(s641_full)[E(s641_full)$task_tie == 0]
)
s641_task <- delete.vertices(
  s641_task,
  V(s641_task)[degree(s641_task) == 0]
)
summary(s641_task)

# Draw each sub-graph. The Fruchterman-Reingold layout is computed
# first and stored, so the same coordinates can be reused later.
social_layout <- layout.fruchterman.reingold(s641_social)
plot(s641_social, layout = social_layout, edge.arrow.size = 0.5)

# Tip: click on the graph and use the drop-down menu to save any
# plot you like -- it will be saved as a pdf.

task_layout <- layout.fruchterman.reingold(s641_task)
plot(s641_task, layout = task_layout, edge.arrow.size = 0.5)

# Question #1 - what can you say about network centralization from these graphs?


###
# 3. CALCULATE CENTRALITY MEASURES FOR SOCIAL
###

# Indegree centrality: the number of people who direct social talk
# to a given individual.
indegree_social <- degree(graph = s641_social, mode = "in")
indegree_social

# Outdegree centrality: the number of people a given individual
# directs social talk to.
outdegree_social <- degree(graph = s641_social, mode = "out")
outdegree_social

# Closeness is built on the geodesic (shortest-path) distances
# between a given node and the other nodes it shares a path with.
# It is closely related to the mean shortest path length, but it is
# scored so that more central nodes receive higher values.
#
# In a directed network there are two variants. In-closeness looks
# at the steps needed to get TO a given node FROM all other nodes
# that can reach it; out-closeness is the same idea with the
# direction of travel reversed.

# In-closeness centrality
incloseness_social <- closeness(graph = s641_social, mode = "in")
incloseness_social

# Out-closeness centrality
outcloseness_social <- closeness(graph = s641_social, mode = "out")
outcloseness_social

# Betweenness centrality counts the shortest paths that pass through
# a specific vertex and is computed by betweenness(). (The previous
# lab used the related edge-level measure, edge betweenness, which
# comes from edge.betweenness().)
betweenness_social <- betweenness(graph = s641_social)
betweenness_social

# Eigenvector centrality rewards a node for being connected to other
# well-connected nodes: five ties to high-scoring nodes are worth
# more than five ties to low-scoring nodes. It is therefore often
# read as a measure of a node's importance in the network.
#
# Directed networks have 'In' and 'Out' versions. In an information
# flow study, for example, In-Eigenvector scores would pick out the
# nodes that are strong receivers of information, and Out-Eigenvector
# scores the nodes that are strong broadcasters.
#
# Here we keep things simple: the network is symmetrized and a single
# undirected eigenvector centrality score is computed.
#
# Unlike the other centrality functions, evcent() does not return a
# plain vector but a compound object, so the scores have to be
# pulled out of its "vector" element.
s641_social_undirected <- as.undirected(s641_social, mode = "collapse")
ev_obj_social <- evcent(s641_social_undirected)
eigen_social <- ev_obj_social[["vector"]]
eigen_social

#####
# Extra Credit - what code would you write in R 
# to get the directed versions of eigenvector centrality?
#####

# Summary table: one row per vertex, one column per centrality score.
#
# The vertex IDs in the first column are NOT the same as the row
# numbers, because isolates were removed earlier.
#
# The first column is named explicitly with the name data.frame()
# would otherwise derive from the expression V(s641_social)$name, so
# later references to V.s641_social..name keep working.
central_social <- data.frame(
  V.s641_social..name = V(s641_social)$name,
  indegree_social = indegree_social,
  outdegree_social = outdegree_social,
  incloseness_social = incloseness_social,
  outcloseness_social = outcloseness_social,
  betweenness_social = betweenness_social,
  eigen_social = eigen_social
)
central_social

# Next, use the table to find the most central actors under each
# measure. Keep the plot in view while reading the tables so the
# results can be sanity-checked against the picture.
plot(
  s641_social,
  vertex.size = 10,
  vertex.label = V(s641_social)$name,
  edge.arrow.size = 0.5,
  layout = layout.fruchterman.reingold,
  main = "Classroom S641 Social Talk"
)

# Table sorted by decreasing indegree. order() gives the row order
# for an ascending sort; negating the column turns that into a
# descending sort. Top actors are 18, 22 and 16.
central_social[order(-central_social[["indegree_social"]]), ]

# Outdegree: 22, 18 and 19.
central_social[order(-central_social[["outdegree_social"]]), ]

# In-closeness: 11, 15 and 18.
# NOTE: the values returned here look strange; judging from the
# plot, 11, 15, and 18 are not central actors at all. This could be
# a bug.
central_social[order(-central_social[["incloseness_social"]]), ]

# Out-closeness: 22, 16, and 19
central_social[order(-central_social[["outcloseness_social"]]), ]

# Eigenvector: 18, 19, and 16
central_social[order(-central_social[["eigen_social"]]), ]

# Visualize the summary statistics: one barplot per centrality
# measure, drawn in the order the columns appear in the table. In
# every plot the y-axis is the value of the measure and the x-axis
# is the node number.
invisible(lapply(
  c(
    "indegree_social", "outdegree_social", "incloseness_social",
    "outcloseness_social", "betweenness_social", "eigen_social"
  ),
  function(measure) {
    barplot(
      central_social[[measure]],
      names.arg = central_social[["V.s641_social..name"]]
    )
  }
))

# Question #2 - What can we say about the social actors if we compare the bar plots? 
# Who seems to run the show in sociable affairs? Who seems to bridge sociable conversations? 


###
# 4. CORRELATIONS BETWEEN CENTRALITY MEASURES
###

# Compute correlations between the columns to see how closely the
# different measures of centrality are interrelated.

# Table of pairwise correlations over the six centrality columns
# (everything except the vertex-name column).
cor(central_social[, c(
  "indegree_social", "outdegree_social", "incloseness_social",
  "outcloseness_social", "betweenness_social", "eigen_social"
)])

# INTERPRETATION:
#
# Indegree and outdegree are very closely correlated (rho = 0.95),
# indicating that social talk with others is reciprocated (i.e.,
# if you talk to others, they tend to talk back to you).
# 
# The same is not true of incloseness and outcloseness (rho = 
# 0.38), indicating that the closeness calculated from inbound
# paths is not strongly associated with closeness from
# outbound paths.
# 
# In- and out-degree are highly correlated with eigenvector
# centrality, indicating that the students that talk the most to
# others (or, relatedly, are talked to the most by others) are
# also the ones that are connected to other highly connected
# students -- possibly indicating high density cliques around
# these individuals.
# 
# Betweenness shows the highest correlation with outdegree, followed
# by indegree. In the case of this particular network, it seems
# that the individuals that talk to the most others are the
# likeliest to serve as bridges between the particular cliques
# (see, e.g., 22 in the plot).


###
# 5. REPEAT FOR TASK TALK
###

# Indegree
# There should be 20 entries, indicating 2 isolates.
indegree_task <- degree(graph = s641_task, mode = "in")
indegree_task

# Outdegree
outdegree_task <- degree(graph = s641_task, mode = "out")
outdegree_task

# In-closeness
incloseness_task <- closeness(graph = s641_task, mode = "in")
incloseness_task

# Out-closeness
outcloseness_task <- closeness(graph = s641_task, mode = "out")
outcloseness_task

# Betweenness. Note that the closeness measures aren't very high
# for node 22, but its betweenness is off the charts.
betweenness_task <- betweenness(graph = s641_task)
betweenness_task

# Eigenvector: symmetrize the network, run evcent(), and keep the
# "vector" element of the result.
s641_task_undirected <- as.undirected(s641_task, mode = "collapse")
ev_obj_task <- evcent(s641_task_undirected)
eigen_task <- ev_obj_task[["vector"]]
eigen_task

# Collect all task centrality values in one data frame, one row per
# vertex. The first column is named explicitly with the name
# data.frame() would otherwise derive from V(s641_task)$name, so
# later references to V.s641_task..name keep working.
central_task <- data.frame(
  V.s641_task..name = V(s641_task)$name,
  indegree_task = indegree_task,
  outdegree_task = outdegree_task,
  incloseness_task = incloseness_task,
  outcloseness_task = outcloseness_task,
  betweenness_task = betweenness_task,
  eigen_task = eigen_task
)
central_task

# In-degree: 22, 18 and 17
central_task[order(-central_task[["indegree_task"]]), ]

# Outdegree: 22, 18 and 17
central_task[order(-central_task[["outdegree_task"]]), ]

# Incloseness: 22, 18 and 17
central_task[order(-central_task[["incloseness_task"]]), ]

# Outcloseness: 22, 18 and 17
central_task[order(-central_task[["outcloseness_task"]]), ]

# Eigenvector: 22, 18 and 17
central_task[order(-central_task[["eigen_task"]]), ]

# One barplot per task centrality measure, drawn in the order the
# columns appear in the table, with the node names along the x-axis.
invisible(lapply(
  c(
    "indegree_task", "outdegree_task", "incloseness_task",
    "outcloseness_task", "betweenness_task", "eigen_task"
  ),
  function(measure) {
    barplot(
      central_task[[measure]],
      names.arg = central_task[["V.s641_task..name"]]
    )
  }
))

# Question #3 - What can we say about the task actors if we compare the bar plots? 
# Who seems to run the show in task affairs? Who seems to bridge task conversations? 


###
# 6. TASK/SOCIAL CORRELATIONS 
###

# Note that in order to do this, we need to either have no missing
# data or use pairwise complete observations.
#
# It would be nice if the centrality functions padded N/A or zero
# data for the isolates, because then the dimensions of the two
# matrices would be compatible. But right now we have 19 nodes for 
# social interaction and 20 nodes for task interaction. So first 
# we have to do some hacky R stuff to make them both have 22
# nodes.

# First, extract the node names from the SSL table as numbers so they
# can be matched with the TSL data, then repeat for TSL.
#
# The name column is a factor when data.frame() converts strings to
# factors (the default before R 4.0) and a plain character vector
# otherwise. Going through as.character() handles both cases; the
# older as.numeric(levels(x))[x] idiom yields only NA values when the
# column is not a factor.
connectednodes_social <- as.numeric(
  as.character(central_social$V.s641_social..name)
)
connectednodes_task <- as.numeric(
  as.character(central_task$V.s641_task..name)
)

# Check that we did this correctly: SSL should have 19 nodes, and
# TSL should have 20 nodes.
length(connectednodes_social)
length(connectednodes_task)

# Restrict both tables to the actors present in both graphs (18 in
# total) and use columns 2 through 7 to create the correlation
# matrix. match() looks every shared node up in each table, so row i
# of the social subset and row i of the task subset always refer to
# the same actor, regardless of the row order of the two tables.
shared_nodes <- intersect(connectednodes_social, connectednodes_task)
cor(
  central_social[match(shared_nodes, connectednodes_social), 2:7],
  central_task[match(shared_nodes, connectednodes_task), 2:7]
)


# INTERPRETATION:
#
# eigen_task is correlated with betweenness_social (rho=0.83) and
# outdegree (rho=0.82), possibly because those who are
# important in talk on tasks also serve as bridges for talk on
# social issues and have many outbound ties.
#
# indegree_task and betweenness_social (rho=0.88), and
# outdegree_task and betweenness_social (rho=0.88) are correlated,
# possibly because the number of indegree and outdegree ties a
# node has with respect to task talk, the more they serve as a
# bridge on social talk.
#
# incloseness_task and incloseness_social (rho=0.86) are
# correlated, meaning that those who lie on shortest paths based
# on inbound ties are similar for both social talk and task
# talk, which seems to make sense given the betweenness
# correlations with network importance and degree between task and
# social talk. More interpretations are possible as well.

# Question #4 - What can we infer about s641 from these results? 
# What sort of substantive story can we derive from it?