From list to sequences

We start from the “aucplaces” dataset which contains several useful synthetic variables * the total number of degree obtained * the highest degree obtained * the main field of study (corresponding to the most specialized or highest degree obtaied)

Load packages

library(tidyverse)
library(networkD3)
library(htmlwidgets)

Load data

library(readr)
aucplaces <- read_csv("~/Places AUC/aucplaces.csv")
aucplaces


Workflow 1. Remove honorary degrees 2. Filter students with more than one degree 3. Identify duplicated years (simultaneous degrees) 4. Rank universities by order of succession

  1. Remove honorary degrees
aucplaces %>% group_by(Honorary) %>% count() # 8 honorary degrees
auc_rank <- aucplaces %>% filter(is.na(Honorary)) # 674 degrees remained


(2) Filter students with more than one degree

auc_rank <- auc_rank %>% group_by(Name_eng) %>% add_tally()
auc_rank <- auc_rank %>% filter(n>1) # 452 curricula remain
auc_rank <- auc_rank %>% filter(!Name_eng %in% c("Fan_Robert", "Lee_Poy Gum")) # obtained only 1 degree (false duplicate)
auc_rank %>% group_by(Name_eng) %>% count() # 195 unique individuals earned at least 2 degrees (up to 5)


(3) Identify duplicated years (simultaneous degrees)

# 35 individuals earned 2 or more (up to 4) degrees the same year 
multiple_rank <- auc_rank %>% group_by(Name_eng, Year_start) %>% count()
multiple_rank_2  <- multiple_rank %>% filter(n>1)


(4) Rank university by order of succession

# filter out multiple rank  
auc_rank_filtered <- auc_rank %>% filter(!Name_eng %in% multiple_rank_2$Name_eng)
auc_rank_filtered %>% group_by(Name_eng, Year_start) %>% count(sort = TRUE)
# rank degrees 
auc_rank_filtered <- auc_rank_filtered %>% mutate(rank = rank(Year_start)) %>% 
  relocate(rank, .before = "Year_start")


Export the dataset “auc_rank” to solve conflicts

write.csv(auc_rank, "auc_rank.csv")


Reimport the manually curated dataset

library(readr)
auc_rank_solved <- read_csv("~/Places AUC/Sequence/auc_rank_solved.csv")
auc_rank_solved


Bind with filtered data

auc_rank_all <- bind_rows(auc_rank_filtered, auc_rank_solved) # 811 degrees 
auc_rank_all %>% group_by(Name_eng) %>% count(sort = TRUE) # 362 individuals, from 2 to 6 degrees
auc_rank_all


We can already use this to analyze the first/last universities attended.

Pivot wider to visualize the sequence of universities and/or disciplines, etc:

auc_rank_to_pivot <- auc_rank_all %>% select(Name_eng, University, rank) 
auc_rank_pivot <- auc_rank_to_pivot %>%
  pivot_wider(names_from = rank, names_glue = "{.value}_{rank}", values_from = University)
auc_sequence <- auc_rank_pivot %>% relocate("University_1", .before = "University_2") %>%
  mutate_all(as.character)
auc_sequence %>% arrange(University_1, University_2)


Join with place number, period and main field

library(readr)
places_attributes <- read_csv("~/Places AUC/places_attributes.csv")

auc_attrib <- aucplaces %>% select(Name_eng, Name_zh, Field_high, period) %>% unique()
place_attrib <- places_attributes %>% select(Name_eng, PlaceLabel)  %>% unique()

auc_sequence <- left_join(auc_sequence, auc_attrib)
auc_sequence <- left_join(auc_sequence, place_attrib)

auc_sequence_df <- auc_sequence %>%
  mutate_all(as.character)


Export for further cleaning

write.csv(auc_sequence_df, "auc_sequence_df.csv")


Re-import clean dataset

auc_sequence_clean <- read_csv("~/Places AUC/Sequence/auc_sequence_clean.csv")


Join with employment data

auc_job <- aucplaces %>% select(Name_eng, Employer_main, Sector_1, Sector_2, start, end) %>% distinct()
auc_sequence_job <- left_join(auc_sequence_clean, auc_job)
auc_sequence_job

From sequences to paths

Select two first degrees

# two first degrees 
auc_seq_2 <- auc_sequence_job  %>% select('1', '2') %>%
  rename(source = '1', target = '2')


Remove missing values and compute link weight

links <- auc_seq_2 %>%
  group_by(source, target) %>% 
  drop_na(target)  %>% 
  count() %>% 
  rename(value = "n")

links %>% arrange(desc(value))


Export table of links

write.csv(links, "deg-1-2.csv")


Focus on migrations (remove students who stayed in the same college)

link <- links[links$source != links$target, ]


Focus on most important flows (n>1)

link <- link %>% filter(value>1)


Create nodes

node <- data.frame(
  name=c(as.character(link$source), 
         as.character(link$target)) %>% unique()
)


Create unique id for each connection

link$IDsource <- match(link$source, node$name)-1 
link$IDtarget <- match(link$target, node$name)-1


Build the networks

p <- sankeyNetwork(Links = link, Nodes = node,
                    Source = "IDsource", Target = "IDtarget",
                    Value = "value", NodeID = "name", 
                    sinksRight=FALSE)
p


Save the widget

library(htmlwidgets)
saveWidget(p, file=paste0( getwd(), "/deg1-2.html"))


We can apply the same code for the next degrees (2 to 3, 3 to 4…) for the students who attended more than 2 universities.

From college to work

First identify and remove students/colleges duplicates

auc_pair <- aucplaces %>% select(Name_eng, University, Employer_main, Sector_1, Sector_2) %>% unique()


Join with places number

auc_pair <- left_join(place_attrib, auc_pair)
auc_pair


Select university and employer to create a list of links

university_employer <- auc_pair %>% select(University, Employer_main)


Create and save list of links

links <- university_employer %>%
  rename(source = University, target = Employer_main) %>%
  group_by(source, target) %>% 
  drop_na(target) %>% 
  count() %>% 
  rename(value = "n")

links %>% arrange(desc(value))
write.csv(links, "univ_employer.csv")


Focus on the most important employers

link <- links %>% filter(value>1)


Create node list (contains every entity involved in the flow)

node <- data.frame(
  name=c(as.character(link$source), 
         as.character(link$target)) %>% unique()
)


Create unique id for each connection

link$IDsource <- match(link$source, node$name)-1 
link$IDtarget <- match(link$target, node$name)-1


Build the network

p <- sankeyNetwork(Links = link, Nodes = node,
                    Source = "IDsource", Target = "IDtarget",
                    Value = "value", NodeID = "name", 
                    sinksRight=FALSE)
p


Save the widget

library(htmlwidgets)
saveWidget(p, file=paste0( getwd(), "/employer.html"))