Abstract
This document is part of a series of three scripts developed to support a comprehensive study of early Chinese PhDs from 1905 to 1962. The study is based on a dataset derived from three catalogs of Chinese doctoral dissertations compiled by the librarian and bibliographer Yuan Tongli 袁同禮 (1895–1965), which cover the United States (1905–1960), the United Kingdom (1916–1961), and Continental Europe (1905–1962). Following an initial script that mapped the dissertations and a second script that analyzed the authors’ social and educational backgrounds, this final script aims to trace their post-graduation careers—particularly after the Communist Revolution of 1949—using information from Wikipedia and Baidu.
In this third and final script, we analyze the post-graduation careers of Chinese PhDs using data from their biographies on Wikipedia and Baidu. The script proceeds in three steps:
The first section documents the methods employed for retrieving biographies from Wikipedia and Baidu. For Wikipedia, which is part of the Modern China Textbase (MCTB), we relied on HistText. For Baidu, which is not yet available in the MCTB, we employed web scraping techniques using Python (see the accompanying Python script).
library(histtext)
library(tidyverse)
# Build the list of PhD holders to look up: one row per distinct
# (ID, NameZH, Region, Discipline) combination, rows without a NameZH dropped,
# plus a Queries column holding the name wrapped in double quotes.
doctors_list <- phd_all_master %>%
  distinct(ID, NameZH, Region, Discipline) %>%
  drop_na(NameZH) %>%
  mutate(Queries = str_glue('"{NameZH}"'))
# Run several queries against one corpus and combine the hits.
#
# Arguments:
#   queries  vector of search strings (may be empty)
#   corpus   corpus name passed on to histtext::search_documents_ex()
#
# Returns the de-duplicated row-bind of all hits; column Q records the query
# that produced each row.
multiple_search <- function(queries, corpus) {
  # Send each query exactly once. Seeding the result with queries[1] and then
  # looping over the full vector would repeat the first remote search, and
  # would search for NA when `queries` is empty.
  hits <- lapply(as.character(queries), function(q) {
    histtext::search_documents_ex(q, corpus) %>%
      mutate(Q = q)
  })
  # bind_rows() accepts a list of data frames (an empty list gives an empty
  # table); distinct() then removes rows that are identical in every column.
  distinct(dplyr::bind_rows(hits))
}
# Search the list in Wikipedia: run every quoted name against the "wikibio-zh" corpus
doctors_wiki <- multiple_search(doctors_list$Queries, "wikibio-zh") # 28837 results
# Derive Name from Title by removing the text matched by the pattern below:
# optional whitespace, an opening parenthesis, and everything after it
doctors_wiki$Name <- trimws(doctors_wiki$Title, whitespace = "\\s*\\(.*") # clean titles
# Compare names in the original list with biography titles:
# join all listed names into one regex alternation ("name1|name2|...") and
# store in title_match the part of each cleaned title that it matches (NA if none)
list_names <- doctors_list$NameZH
list_names <- paste(list_names, sep = "", collapse = "|")
doctors_wiki <- doctors_wiki %>% mutate(title_match = str_extract(Name, list_names))
doctors_wiki_match <- doctors_wiki %>% filter(!is.na(title_match)) # 2014 matches
# Compare names in the original list with matches based on their length:
# diff is 0 only when the matched name is as long as the whole cleaned title
# NOTE(review): str_extract() returns a single match per title; if one listed name is a
# prefix of another, the shorter one may be returned and the title dropped below - confirm
doctors_wiki_match <- doctors_wiki_match %>% mutate(wiki_length = nchar(Name)) %>%
mutate(query_length = nchar(title_match)) %>% mutate(diff = (query_length-wiki_length))
# diff is numeric; comparing it with the string "0" relies on R's implicit coercion
doctors_wiki_filtered <- doctors_wiki_match %>% filter(diff == "0") # select exact matches only : 1005 names remain
# Pair each retained DocId with the query that found it, copying Q into Queries
doctor_IDQ <- doctors_wiki_filtered %>% select(DocId, Q) %>%
mutate(Queries = Q)
# No `by` is given, so the join uses every column name the two tables share
doctor_IDQ <- left_join(doctor_IDQ, doctors_list)
# NOTE(review): doctor_IDQ was built from DocId and Q only, so title_match is
# presumably absent already and this line has no effect - confirm
doctor_IDQ$title_match <- NULL
# Count number of occurrences for each name: add_tally() adds the number of rows per title_match
doctors_wiki_filtered <- doctors_wiki_filtered %>% group_by(title_match) %>% add_tally()
doctors_wiki_filtered_unique <- doctors_wiki_filtered %>% distinct(DocId, Title, Name) # 665 unique bios
# Note that some individuals do not have a biography of their own, but they are mentioned in the biographies of others. For example, Hu Shi appears 19 times—indicating that 18 doctors (excluding Hu Shi himself) are mentioned in his biography.
# Extract the first run of digits from each biography title as a proxy for the year of birth
# (year is NA when the title contains no digits)
doctors_wiki_filtered_unique <- doctors_wiki_filtered_unique %>%
mutate(year = as.integer(str_extract(Title, "[0-9]+")))
# Keep titles without a year plus titles whose year is before 1949,
# i.e. remove individuals with a title year of 1949 or later => 652 remain
# (the two subsets are filtered separately and re-bound, so rows without a year come first)
doctors_wiki_filtered_unique1 <- doctors_wiki_filtered_unique %>% filter(is.na(year))
doctors_wiki_filtered_unique2 <- doctors_wiki_filtered_unique %>% filter(year < 1949)
doctors_wiki_filtered_unique <- bind_rows(doctors_wiki_filtered_unique1, doctors_wiki_filtered_unique2)
# Extract full text of the retained biographies from the "wikibio-zh" corpus
doctors_wiki_ft <- histtext::get_documents(doctors_wiki_filtered_unique, "wikibio-zh")
# Extract year of birth: collect every 4-digit sequence found in each text,
# then take the first run of digits as the birth year
# NOTE(review): `year` is a list column at this point, so the second regmatches() call
# depends on R coercing each list element to a string - confirm it yields the first
# 4-digit number of every biography, including texts with no 4-digit number
doctors_wiki_ft$year <- regmatches(doctors_wiki_ft$Text, gregexpr("\\d{4}", doctors_wiki_ft$Text))
doctors_wiki_ft$birth <- regmatches(doctors_wiki_ft$year, regexpr("[[:digit:]]+", doctors_wiki_ft$year))
doctors_wiki_ft$year <- NULL
# Filter individuals born between 1800 and 1942 (both bounds exclusive) => 370 remain
# NOTE(review): birth comes from regmatches() and is presumably character, so these
# comparisons rely on coercion - confirm they behave as numeric comparisons
doctors_wiki_ft_filtered <- doctors_wiki_ft %>% filter(birth < 1942) %>% filter(birth > 1800)
# Remove those who died before 1905 (year of first phd) -> 361 remain
# Implemented as: keep birth > 1827 and exclude DocId 700536 explicitly
doctors_wiki_ft_filtered <- doctors_wiki_ft_filtered %>% filter(birth > 1827) %>% filter(!DocId == "700536")
# Recompute Name from Title with the same pattern used for the search results
doctors_wiki_ft_filtered$Name <- trimws(doctors_wiki_ft_filtered$Title, whitespace = "\\s*\\(.*") # clean titles
# Save dataset as csv file
write.csv(doctors_wiki_ft_filtered, "doctors_wiki_ft_filtered.csv")
# Imports necessary modules for data manipulation (pandas, json), scraping (Selenium, BeautifulSoup), and handling delays and regular expressions.
import pandas as pd
import json
import time
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Helper function (Cleans up HTML text): Collapses multiple whitespaces and Strips leading/trailing spaces.
def clean_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()
# Extracts side panel information from the Baike page: Image and description from the overview album; Statistics like views, edits, and last update; Contributors listed by username; Side catalog links for quick navigation
def extract_side_content(soup):
    """Collect side-panel metadata from a parsed Baike page.

    Args:
        soup: Parsed page; searched with ``find`` / ``find_all``.

    Returns:
        dict: Empty when the page has no ``<div id="side">``. Otherwise it may
        contain the keys below, each present only when the matching element
        is found:

        * ``overview_image`` / ``overview_description`` - from the
          ``abstractAlbum_`` block (each ``None`` if that part is missing);
        * ``statistics`` - dict with any of ``views``, ``edits``,
          ``last_update`` (the cleaned text of the div carrying the label);
        * ``contributors`` - list of non-blank user names;
        * ``side_catalog`` - list of catalog entry texts.
    """
    metadata = {}
    side_div = soup.find("div", id="side")
    if not side_div:
        return {}

    # Class names are matched by substring (e.g. 'abstractAlbum_' in the
    # class value), not by an exact class name.
    album = side_div.find('div', class_=lambda x: x and 'abstractAlbum_' in x)
    if album:
        img = album.find('img')
        metadata['overview_image'] = img['src'] if img and img.has_attr('src') else None
        description = album.find('div', class_=lambda x: x and 'albumInfo_' in x)
        metadata['overview_description'] = clean_text(description.text) if description else None

    # Stats
    stats = side_div.find('div', class_=lambda x: x and 'lemmaStatistics_' in x)
    if stats:
        # Label found in the div text -> key under which that text is stored.
        # Order matters: only the first matching label is used for a div.
        stat_labels = (
            ("浏览次数", 'views'),
            ("编辑次数", 'edits'),
            ("最近更新", 'last_update'),
        )
        stat_data = {}
        for div in stats.find_all('div'):
            text = clean_text(div.get_text())
            for label, key in stat_labels:
                if label in text:
                    # A later div carrying the same label overwrites an earlier one.
                    stat_data[key] = text
                    break
        if stat_data:
            metadata['statistics'] = stat_data

    # Contributors
    contributors_div = side_div.find('div', id="J-contributor-list")
    if contributors_div:
        users = contributors_div.find_all('a', class_=lambda x: x and 'userName_' in x)
        metadata['contributors'] = [clean_text(u.text) for u in users if u.text.strip()]

    # Catalog (side navigation)
    catalog_div = side_div.find('div', id="J-side-catalog")
    if catalog_div:
        items = catalog_div.find_all('a', class_=lambda x: x and 'catalogItem_' in x)
        metadata['side_catalog'] = [clean_text(a.get_text()) for a in items]

    return metadata
# Extracts the main textual content of the article from div elements matching contentTab_
def extract_main_content(soup):
    """Return the article text of a parsed Baike page.

    Every ``div`` whose class contains ``contentTab_`` contributes its
    whitespace-normalised text; the pieces are joined with a blank line.
    Returns an empty string when no such ``div`` exists.
    """
    sections = soup.find_all("div", class_=lambda x: x and "contentTab_" in x)
    return "\n\n".join(clean_text(s.get_text()) for s in sections)
# Initializes a headless Chrome browser via Selenium
def init_browser():
    """Start a headless Chrome session and return its WebDriver.

    The caller owns the returned driver and is responsible for calling
    ``quit()`` on it.
    """
    options = Options()
    options.add_argument('--headless')  # Headless = runs in the background
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--lang=zh-CN')  # set language to simplified Chinese
    driver = webdriver.Chrome(options=options)
    return driver
# Define Main scraper for one entry: Builds the URL using the item’s name; Loads the page and waits for a key element (#side) to appear; Checks if the page exists or is empty; If valid, extracts: main content and metadata; Returns all collected information in a structured dictionary.
def scrape_baike_page(driver, name):
    """Scrape one Baidu Baike entry.

    Args:
        driver: Selenium WebDriver used to load the page.
        name: Entry name; appended to ``https://baike.baidu.com/item/``.

    Returns:
        dict: Always contains ``name`` and ``url``. On success it also holds
        ``main_content`` (``"N/A"`` when empty) and ``metadata``; otherwise
        it holds a single ``error`` message. Exceptions are reported through
        ``error`` rather than raised.
    """
    # Imported here so the function carries its own dependency.
    from selenium.common.exceptions import TimeoutException

    url = f"https://baike.baidu.com/item/{name}"
    try:
        driver.get(url)
        try:
            WebDriverWait(driver, 8).until(
                EC.presence_of_element_located((By.ID, "side"))
            )
        except TimeoutException:
            # A page without a #side panel is not a driver failure. Fall
            # through so the checks below can classify it ("not found" /
            # "no useful content") or extract the content that is there;
            # otherwise those branches could never be reached.
            pass
        else:
            time.sleep(1)  # give the rest of the page a moment to render

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        if soup.find('div', class_=lambda x: x and 'sorryBox' in x):
            return {
                "name": name,
                "url": url,
                "error": "Page not found or does not exist"
            }

        has_side = soup.find("div", id="side")
        has_content = soup.find_all("div", class_=lambda x: x and "contentTab_" in x)
        if not has_side and not has_content:
            return {
                "name": name,
                "url": url,
                "error": "No useful content found"
            }

        main_content = extract_main_content(soup)
        metadata = extract_side_content(soup)
        return {
            "name": name,
            "url": url,
            "main_content": main_content or "N/A",
            "metadata": metadata or {}
        }
    except Exception as e:
        # Deliberate catch-all: one failing entry must not abort the whole
        # run. Only the first line of the message is kept; an exception with
        # an empty message must not raise IndexError here.
        message_lines = str(e).splitlines()
        first_line = message_lines[0] if message_lines else ""
        return {
            "name": name,
            "url": url,
            "error": f"Selenium error: {type(e).__name__}: {first_line}"
        }
# Orchestrates the full workflow
def main(csv_path, output_path="baidu.json", delay=1):
    """Scrape every name listed in a CSV file and save the results as JSON.

    Args:
        csv_path: CSV file whose first column holds the entry names.
        output_path: Destination JSON file (UTF-8, non-ASCII kept as is).
        delay: Seconds to pause after each entry.

    Returns:
        list: One result dict per name, in input order.
    """
    df = pd.read_csv(csv_path)
    names = df.iloc[:, 0].dropna().astype(str).tolist()
    driver = init_browser()
    results = []
    try:
        for name in names:
            print(f"Scraping: {name}")
            result = scrape_baike_page(driver, name)
            results.append(result)
            time.sleep(delay)
    finally:
        # Always shut the browser down, even if the loop is interrupted,
        # so no Chrome process is left behind.
        driver.quit()
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nSaved {len(results)} results to {output_path}")
    return results
# Run the scraper only when this file is executed as a script, so importing
# the helpers does not start a browser session.
if __name__ == "__main__":
    main("YTL_List.csv")  # input list of names
Using this approach, we identified a total of 1,252 individuals: 361 from Wikipedia, 1,179 from Baidu, and 24 appearing in both knowledge bases. For those documented in both Wikipedia and Baidu, we prioritized their Baidu biographies, as these are generally more comprehensive.
After removing false positives, 1,079 unique individuals remained. Their regional distribution is as follows:
wikibai_id_text