Abstract
This document introduces various network analyses conducted on the Who’s Who of American returned students (Tsing Hua, 1917), including kinship networks and affiliation networks.
The first step is to load the edge list (representing kinship ties) and the node list (containing students’ and relatives’ attributes). For a detailed description of the kinship data and the method for extracting it from the directory, please refer to the relevant documentation.
# load packages
library(readr)
library(tidyverse)
library(igraph)
# Read the kinship edge list (semicolon-delimited file of kinship ties)
library(readr)
kinship <- read_delim(
  file = "data/kinship.csv",
  delim = ";",
  trim_ws = TRUE,
  escape_double = FALSE
)
# Read the node list, used to tell biography subjects apart from relatives
kinship_nodes <- read_csv(file = "data/kinship_nodes.csv")
head(kinship_nodes)
# Build a directed graph: edges from `kinship`, vertex attributes from
# `kinship_nodes`
g <- graph_from_data_frame(kinship, directed = TRUE, vertices = kinship_nodes)
g
## IGRAPH 119964f DN-- 376 216 --
## + attr: name (v/c), Type (v/c), DocId (v/c), RelationMain (e/c),
## | Relation2 (e/c)
## + edges from 119964f (vertex names):
## [1] 余日章 ->余日宣 兪慶恩 ->兪慶堯 卞壽孫 ->卞福孫
## [4] 張景芬 ->張福生 方伯樑 ->方伯麟 曹芳芸 ->曹錫庚
## [7] 朱兆莘 ->朱兆燮 朱復 ->朱維民 歐陽祺 ->歐陽庚
## [10] 沈寳善 ->沈寳勳 盧壽汶 ->盧壽澂 羅泮忱 ->羅忠在
## [13] 胡詒穀 ->胡詒芳 諶湛溪 ->諶石勳 謝學瀛 ->謝學濤
## [16] 謝恩隆 ->謝欣榮 鄒應藼 ->鄒應歡 鄺孫謀 ->鄺芹生
## [19] 鍾文邦 ->鍾文鰲 陳同壽 ->陳明壽 陳宗良 ->陳宗賢
## + ... omitted several edges
# components(g)
The kinship network contains 376 nodes and 216 ties,
representing 172 family components, with sizes ranging from 2 members
to 6 members, the largest being Shi Zhaoji’s (施肇基) family.
In the next step, we visualize the kinship network. We use the color and shape of nodes to differentiate between the students who are the subject of a biography (red squares) and the relatives mentioned in their biographies (light blue circles). The color of ties represents the type of relation: father (grey), uncle (gold), brother (yellow-green), or spouse (light blue).
# Vertex appearance, selected through the graph's own `Type` attribute:
# biography subjects ("ego") are red squares, relatives light blue circles
V(g)[Type == "ego"]$shape <- "square"
V(g)[Type == "ego"]$color <- "red"
V(g)[Type == "relative"]$shape <- "circle"
V(g)[Type == "relative"]$color <- "light blue"
# Edge color, selected through the `RelationMain` edge attribute; relation
# types not listed here get no explicit color
E(g)[RelationMain == "Father"]$color <- "grey"
E(g)[RelationMain == "Uncle"]$color <- "gold"
E(g)[RelationMain == "Brother"]$color <- "yellowgreen"
E(g)[RelationMain == "Spouse"]$color <- "light blue"
# Draw the full kinship graph; arrow heads are hidden (size 0)
plot(
  g,
  main = "Kinship Network of Early Liumei",
  vertex.size = 3,
  vertex.label.cex = 0.3,
  vertex.label.color = "black",
  edge.width = 0.5,
  edge.curved = 0.1,
  edge.arrow.size = 0
)
To improve legibility, we will remove dyads and isolated nodes by
selecting the largest components (size > 2):
# Connected components of the kinship graph
cl <- components(g)
# One row per vertex: component number, component size, vertex name.
# The vertex names arrive as row names of the data frame and are moved into
# a `name` column; the result stays grouped by `comp_no`.
family_components <- data.frame(comp_no = cl$membership) %>%
  rownames_to_column("name") %>%
  group_by(comp_no) %>%
  add_tally(name = "size") %>%
  relocate(name, .after = size)
# Attach the node attributes (joined on the columns the two tables share)
family_components_attributes <- family_components %>%
  inner_join(kinship_nodes)
Remove isolated nodes
# Collect, component by component, the names of vertices that belong to a
# component with more than one member (i.e. drop isolated nodes)
family_filtered <- lapply(which(cl$csize > 1), function(comp_id) {
  V(g)$name[cl$membership == comp_id]
})
subv <- unlist(family_filtered)
kin1 <- as.data.frame(subv) # convert into dataframe
# induced_subgraph() is the current name of the deprecated induced.subgraph()
g1 <- induced_subgraph(graph = g, vids = subv)
# g1 has 371 nodes and 213 ties
# Plot the reduced graph. No shape/color is set here: g1 is expected to
# carry over the vertex and edge attributes already set on g.
plot(
  g1,
  vertex.size = 3,
  vertex.label.color = "black",
  vertex.label.cex = 0.3,
  edge.width = 2,
  edge.arrow.size = 0,
  edge.curved = 0,
  main = "Kinship Network of Early Liumei (no isolates)"
)
# Remove dyads as well: keep only vertices in components of more than two
# members, collected component by component
family_filtered2 <- lapply(which(cl$csize > 2), function(comp_id) {
  V(g)$name[cl$membership == comp_id]
})
subv2 <- unlist(family_filtered2)
kin2 <- as.data.frame(subv2) # convert into dataframe
# Save the retained vertex names to the working directory
write.csv(kin2, "kin2.csv")
# induced_subgraph() is the current name of the deprecated induced.subgraph()
g2 <- induced_subgraph(graph = g, vids = subv2)
# g2 has 83 nodes and 64 ties
# Larger nodes and labels than the previous plots, since fewer vertices remain
plot(
  g2,
  vertex.size = 5,
  vertex.label.color = "black",
  vertex.label.cex = 0.5,
  edge.width = 2.5,
  edge.arrow.size = 0,
  edge.curved = 0,
  main = "Kinship Network of Early Liumei (cluster > 2)"
)
We load the affiliation data retrieved using Named Entity Recognition (NER), as described in the Data Extraction Script.
# load packages
library(readr)
library(tidyverse)
# Read the affiliation table and preview its first rows
library(readr)
affiliation_eng <- read_csv(file = "data/affiliation_eng.csv")
head(affiliation_eng)
The dataset contains a total of 3,431 affiliations retrieved
from the 401 English biographies. For each affiliation, the table
provides the following information:
There is a total of 1,305 unique organizations, with frequency ranging from 1 to 96 (in the case of St John’s University). The educational sector represents the largest number (536, 40%), followed by associations (254, 19.5%), and media (newspapers, journals) (108, 8%). The largest number of affiliations refers to the students’ period of studies in the US (1460, 42%) (mostly educational institutions, 810, 55%), followed by their post-return career (1154, 33.6%), their life prior to going to the US (803), and other (non-dated) affiliations.
# Distribution of affiliations.
# count(col, sort = TRUE) replaces group_by(col) %>% count(sort = TRUE):
# same counts and ordering, but the result is an ungrouped tibble.

# Number of affiliations per organization, most frequent first
affiliation_eng %>%
  count(Organization, sort = TRUE)
# Number of distinct organizations per main category
affiliation_eng %>%
  distinct(category_main, Organization) %>%
  count(category_main, sort = TRUE)
# Number of affiliations per Position value (the life period in the text)
affiliation_eng %>%
  count(Position, sort = TRUE)
To build the network, we need to create the edge list (list of links between students and organizations) and the node list (category of node, persons or organizations):
# Edge list: one row per (person, organization, position) affiliation.
# DocName combines the document id with the person's name so that homonyms
# (e.g. wife/husband) from different biographies stay distinct.
affiliation_edge <- affiliation_eng %>%
  mutate(DocName = paste(DocId, name, sep = "_")) %>%
  select(DocName, Organization, Position)
# Node list, persons: one row per distinct DocName, typed "Person" twice
person_node <- affiliation_edge %>%
  distinct(DocName) %>%
  rename(Name = DocName) %>%
  mutate(Type = "Person", Type2 = "Person")
# Node list, organizations: one row per distinct (organization, category)
# pair, with the main category stored as Type2
org_node <- affiliation_eng %>%
  select(Name = Organization, Type2 = category_main) %>%
  distinct() %>%
  mutate(Type = "Organization", .before = Type2)
# Stack both node tables (columns: Name, Type, Type2)
affiliation_node <- bind_rows(person_node, org_node)
Next, we create the bipartite network with igraph:
library(igraph)
# Creating a network from the edge list (first two columns are the endpoints,
# Position becomes an edge attribute)
Net <- graph_from_data_frame(affiliation_edge, directed = FALSE)
# graph_from_data_frame() without a `vertices` table only creates the `name`
# vertex attribute, so the Person/Organization label used for styling below
# must be set explicitly. A vertex is a person when its name appears in the
# DocName column of the edge list, otherwise it is an organization.
V(Net)$Type <- ifelse(
  V(Net)$name %in% affiliation_edge$DocName,
  "Person",
  "Organization"
)
## Transformation into a 2-mode network
V(Net)$type <- bipartite_mapping(Net)$type
# Projection (multiplicity = TRUE stores shared-neighbour counts as weights)
projNet <- bipartite_projection(Net, multiplicity = TRUE)
Net1 <- projNet$proj1 # Network of persons
Net2 <- projNet$proj2 # Network of organizations
# assign shape and color to type of node
V(Net)$shape <- ifelse(V(Net)$Type == "Person", "square", "circle")
V(Net)$color <- ifelse(V(Net)$Type == "Person", "tomato", "blue")
plot(
  Net,
  vertex.size = 3,
  vertex.color = V(Net)$color,
  vertex.shape = V(Net)$shape,
  vertex.label.color = "black",
  vertex.label.cex = 0.3,
  main = "Early Liumei Affiliation Network"
)
# remove labels and make size proportionate to degree
layout <- layout_nicely(Net)
plot(
  Net,
  vertex.size = degree(Net) * 0.15,
  vertex.color = V(Net)$color,
  vertex.shape = V(Net)$shape,
  vertex.label = NA,
  layout = layout,
  main = "Early Liumei Affiliation Network"
)
Plot projections
# One-mode projection: persons tied by shared organizations
plot(
  Net1,
  main = "Network of persons linked by organizations",
  vertex.size = 3,
  vertex.color = "orange",
  vertex.label.cex = 0.3,
  vertex.label.color = "black"
)
# One-mode projection: organizations tied by shared persons
plot(
  Net2,
  main = "Network of organizations linked by persons",
  vertex.size = 3,
  vertex.color = "steel blue",
  vertex.label.cex = 0.3,
  vertex.label.color = "black"
)