Text preparation

The Who’s Who of American Returned Students (1917) is part of the “imh collection” of who’s who directories generously shared by the Institute of Modern History, Academia Sinica, Taipei. The plain texts of the biographies are stored on a Solr server and can be mined using the HistText R package.

Load the required packages:

library(histtext)
library(tidyverse)


To retrieve the full text, we need to find the title of the book and the name of the target fields:

# List the metadata fields that can be used as filters on the "imh-zh" corpus
# (the recorded output shows "book", "bookno" and "page").
histtext::list_filter_fields("imh-zh")
## [1] "book"   "bookno" "page"
# List the values that the "book" filter accepts, to find the target title.
histtext::list_possible_filters("imh-zh", "book") 


The book we are interested in is the one titled 游美同學錄 (Youmei tongxuelu). It contains 401 individual entries (biographies). We can now retrieve all biographies, in Chinese and English:

# Wildcard search restricted to the target book: one result row per biography,
# first in the Chinese corpus, then in the English one.
search_imh_zh <- histtext::search_documents_ex(
  "*",
  corpus = "imh-zh",
  filter_query = list(book = "游美同學錄")
)
search_imh_en <- histtext::search_documents_ex(
  "*",
  corpus = "imh-en",
  filter_query = list(book = "游美同學錄")
)


Convert row names into ID

# Add an explicit integer "ID" column holding the row position.
search_imh_zh <- search_imh_zh %>% tibble::rowid_to_column(var = "ID")
search_imh_en <- search_imh_en %>% tibble::rowid_to_column(var = "ID")

# Print both search results for inspection.
search_imh_zh
search_imh_en


Retrieve full text

# Download the full text of every search hit, ten documents per request.
imh17_zh_docs <- get_documents(
  search_imh_zh,
  corpus = "imh-zh",
  batch_size = 10,
  verbose = FALSE
)
imh17_eng_docs <- get_documents(
  search_imh_en,
  corpus = "imh-en",
  batch_size = 10,
  verbose = FALSE
)


Convert row names into ID again

# Number the retrieved documents with an explicit "ID" column.
imh17_zh_docs <- imh17_zh_docs %>% tibble::rowid_to_column(var = "ID")
imh17_eng_docs <- imh17_eng_docs %>% tibble::rowid_to_column(var = "ID")


Measure length of biographies, based on the number of characters in Chinese, number of words (tokens) in English:

library(quanteda)

# Chinese biographies: length is the number of characters in the text.
imh17_zh_docs <- imh17_zh_docs %>% mutate(length = nchar(Text))

# English biographies: length is the number of tokens in the text.
# The pipeline must start from `imh17_eng_docs`; starting from the Chinese
# table would replace the English documents with the Chinese ones.
imh17_eng_docs <- imh17_eng_docs %>% mutate(length = ntoken(Text))

# Print both tables for inspection.
imh17_zh_docs
imh17_eng_docs


Save datasets as csv files

# Export both full-text tables to the working directory.
write.csv(imh17_zh_docs, file = "imh17_zh_fulltext.csv")
write.csv(imh17_eng_docs, file = "imh17_eng_fulltext.csv")

Information extraction (Chinese)

First remove extra white space

# Normalise whitespace with str_squish(), then delete the remaining single
# spaces so that `text_clean` contains no ASCII space at all.
imh17_zh_clean <- mutate(
  imh17_zh_docs,
  text_clean = str_remove_all(str_squish(Text), " ")
)

Family data

Father

Extract father’s name

# Start the `family` table: capture the first span that runs from "父"
# (father) up to the next full stop "。".
family <- mutate(
  imh17_zh_clean,
  father_name = str_extract(text_clean, "父\\s*(.*?)\\s*。")
)


Extract father’s occupation using the structure of the narrative (anything before the current address)

# Father's occupation: everything from "父" up to the first address marker.
# The five markers are wrapped in a non-capturing group so that the "父…"
# prefix applies to all of them; without the group the alternation splits the
# whole pattern and a bare marker such as "本籍通信處" matches on its own.
family <- family %>%
  mutate(
    father_occupation = str_extract(
      text_clean,
      "父\\s*(.*?)\\s*(?:本籍住址|本籍通信處|永久通信處|永久住址|家中住址)"
    ),
    # drop the trailing address marker that terminated the match
    father_occupation = str_remove_all(
      father_occupation,
      "本籍住址|本籍通信處|永久通信處|永久住址|家中住址"
    ),
    # drop the father's name; fixed() treats the name as literal text so that
    # any regex metacharacter it may contain is not interpreted
    father_occupation = str_remove_all(father_occupation, fixed(father_name))
  )


Remove useless information from father’s name/occupation

family <- family %>%
  mutate(
    # one character class strips both the full stop and the "父" marker
    father_name = str_remove_all(father_name, "[。父]"),
    # full stops first, then the marital-status token "已婚"
    father_occupation = father_occupation %>%
      str_remove_all("。") %>%
      str_remove_all("已婚")
  )

Uncle

Extract uncle’s name and use the number of characters to detect anomalies (names with fewer or more than 2 characters should be discarded)

family <- family %>%
  mutate(
    # span from "叔" (uncle) to the next full stop, then strip both markers
    uncle_name = str_remove_all(
      str_extract(text_clean, "叔\\s*(.*?)\\s*。"),
      "[。叔]"
    ),
    # character count, used afterwards to spot anomalous extractions
    uncle_nchar = nchar(uncle_name)
  )

Siblings

Elder brother (兄)

family <- family %>%
  mutate(
    # span from "兄" to the next full stop, without punctuation or marker
    xiong_name = str_remove_all(
      str_extract(text_clean, "兄\\s*(.*?)\\s*。"),
      "[。兄]"
    ),
    # count characters to filter out strings with more than 4 characters
    xiong_nchar = nchar(xiong_name)
  )

Younger brother (弟)

family <- family %>%
  mutate(
    # span from "弟" to the next full stop, without punctuation or marker
    di_name = str_remove_all(
      str_extract(text_clean, "弟\\s*(.*?)\\s*。"),
      "[。弟]"
    ),
    # count characters to filter out strings with more than 4 characters
    di_nchar = nchar(di_name)
  )

Marital status

We rely on pattern matching to retrieve information on their marital status (已婚 = married, 未婚 = unmarried):

# Marital-status tokens, joined into a single alternation pattern.
married <- c("已婚", "未婚")
married_vec <- str_c(married, collapse = "|")
# Keep the first token found in each biography (NA when neither occurs).
family <- mutate(family, married = str_extract(text_clean, married_vec))

Children

We also rely on pattern matching to extract information on the number of children. After a close examination of the data, we found that the maximum number of sons or daughters was 9. On this basis we create a vector of possible cases (ranging from 1 to 9 sons or daughters). We used the characters “子” and “女” as anchor for sons and daughters, as shown below:

# Candidate tokens: anchor character followed by a numeral from one to nine.
sons <- paste0("子", c("一", "二", "三", "四", "五", "六", "七", "八", "九"))
son_vec <- str_c(sons, collapse = "|")
daugther <- paste0("女", c("一", "二", "三", "四", "五", "六", "七", "八", "九"))
daugther_vec <- str_c(daugther, collapse = "|")

# Extract the first matching token and keep only the numeral.
family <- mutate(
  family,
  sons = str_remove_all(str_extract(text_clean, son_vec), "子")
)
family <- mutate(
  family,
  daugthers = str_remove_all(str_extract(text_clean, daugther_vec), "女")
)


Inspect last output with all family information:

# Preview the first rows of the family table.
family %>% head()

Education

Source of funding

To retrieve information related to the students’ source of funding, we again relied on pattern matching. We first closely read a sample of biographies to identify all possible types of funding. Then we created a vector listing the four possible cases:

  • “官費遊美” (guanfei youmei): government sponsored student
  • “公費遊美” (gongfei youmei): other public funding (other than government)
  • “後得半官費” (houdeban guanfei): partial government scholarship
  • “自費遊美” (zifei youmei): self-funded student
# Funding expressions to look for, joined into one alternation pattern.
funding <- c("官費遊美", "公費遊美", "半官費遊美", "自費遊美")
funding_vec <- str_c(funding, collapse = "|")
# Keep the first expression found and drop its "遊美" suffix.
family_funding <- mutate(
  family,
  funding = str_remove_all(str_extract(text_clean, funding_vec), "遊美")
)

Year of return

To retrieve the year when the students returned to China, we used the “search_concordance” function included in the histtext package:

# Concordance of both written forms of "return to the country", with 15
# units of context on each side, restricted to the target book.
search_imh_zh_conc <- histtext::search_concordance_ex(
  '"回國" | "囘國"',
  corpus = "imh-zh",
  context_size = 15,
  filter_query = list(book = "游美同學錄")
)


We found that the pattern appears in 366 biographies, and twice in 6 of them (the students went abroad and returned twice), whereas 35 biographies do not contain the pattern (either because the student has not returned, or because another expression was used):

# Number of matches per biography, most frequent first.
count(group_by(search_imh_zh_conc, DocId), sort = TRUE)


Find out who is missing:

# Document ids present in the family table but absent from the concordance.
setdiff(family[["DocId"]], search_imh_zh_conc[["DocId"]])
##  [1] "imh-11-12"  "imh-11-18"  "imh-11-24"  "imh-11-33"  "imh-11-44" 
##  [6] "imh-11-50"  "imh-11-95"  "imh-11-99"  "imh-11-106" "imh-11-113"
## [11] "imh-11-139" "imh-11-152" "imh-11-163" "imh-11-165" "imh-11-171"
## [16] "imh-11-175" "imh-11-205" "imh-11-208" "imh-11-229" "imh-11-246"
## [21] "imh-11-251" "imh-11-255" "imh-11-256" "imh-11-276" "imh-11-296"
## [26] "imh-11-324" "imh-11-334" "imh-11-335" "imh-11-338" "imh-11-342"
## [31] "imh-11-352" "imh-11-364" "imh-11-371" "imh-11-375" "imh-11-385"


Remove white spaces from “Before” and “After”

# Space-free copies of the left and right context columns.
imh_zh_conc <- mutate(
  search_imh_zh_conc,
  before_clean = str_replace_all(Before, " ", ""),
  after_clean = str_replace_all(After, " ", "")
)


Clean the field “Before”

imh_zh_conc <- imh_zh_conc %>%
  mutate(
    # the return date is searched in the last seven characters before the match
    return_date = str_sub(before_clean, -7, -1),
    # ordered clean-up: keep "年" without its full stop, drop everything up to
    # the last remaining full stop, then strip paragraph-tag residue
    return_date_clean = return_date %>%
      str_replace_all("年。", "年") %>%
      str_remove(".*。") %>%
      str_replace_all("</p>", "") %>%
      str_replace_all("p>", "") %>%
      str_replace_all("/", "")
  ) %>%
  relocate(return_date_clean, .before = Matched)


Extract date patterns for further cleaning. We use a vectorized list of temporal referentials to be found in the text:

  • “光緖”: Guangxu emperor’s reign (1875-1908)
  • “宣統”: Xuantong (Pu Yi)’s reign (1909-1911)
  • “民國”: Republican calendar (1912-)
  • “是年”: this year
# Calendar markers to look for, joined into one alternation pattern.
zh_date <- c("民國", "宣統", "光緖", "是年")
zh_date_vec <- str_c(zh_date, collapse = "|")
# Record which marker (if any) occurs in the cleaned return date and place the
# new column just before it.
imh_zh_conc <- imh_zh_conc %>%
  mutate(date_zh = str_extract(return_date_clean, zh_date_vec)) %>%
  relocate(date_zh, .before = return_date_clean)


Clean the field “After”

# Build `after_return_clean` from the right-hand context of each match by
# applying an ordered series of removals (full stops, "<" and "p>" tag
# residue), then move the new column right after `Matched`.
# NOTE(review): the input `after_clean` was created by removing every " " from
# `After`, so the patterns below that contain a literal space ("國 。",
# ".。 任*", "。 ", " ") cannot match here. They look as if they were written
# for the raw `After` column — confirm the intended input before relying on
# these steps.
imh_zh_conc <- imh_zh_conc %>% mutate(after_return_clean = str_replace_all(after_clean, "國 。", "")) %>%
  mutate(after_return_clean = str_remove(after_return_clean,".。 任*")) %>%
  mutate(after_return_clean = str_replace_all(after_return_clean,"。 ", "")) %>% 
  mutate(after_return_clean = str_replace_all(after_return_clean, " ", "")) %>% 
  mutate(after_return_clean = str_replace_all(after_return_clean,"。", "")) %>% 
  mutate(after_return_clean = str_replace_all(after_return_clean,"<", "")) %>% 
  mutate(after_return_clean = str_replace_all(after_return_clean,"p>", "")) %>%
  relocate(after_return_clean, .after = Matched)


Extract date patterns for further cleaning

imh_zh_conc <- imh_zh_conc %>%
  mutate(
    # calendar marker found in the text that follows the match, if any
    after_date_zh = str_extract(after_return_clean, zh_date_vec),
    # first character of the text that follows the match
    post_return = str_sub(after_return_clean, 1, 1)
  ) %>%
  # resulting order: after_return_clean, after_date_zh, post_return
  relocate(after_date_zh, post_return, .after = after_return_clean)


Select variables for joining with family and funding data

# Keep the columns needed for the join, renaming two of them on the way.
conc_zh_to_join <- select(
  imh_zh_conc,
  DocId,
  date_zh,
  return_date = return_date_clean,
  Matched,
  post_return = after_return_clean,
  after_date_zh
)

# Attach the return information to the family/funding table by document id.
imh_zh_conc_join <- family_funding %>% full_join(conc_zh_to_join, by = "DocId")

imh_zh_conc_join

Places

Address

Extract and clean current address (in 1917)

library(strex)

# Concordance on the five address markers, with 30 units of context.
address <- histtext::search_concordance_ex(
  '"本籍住址" | "本籍通信處"|"永久通信處"|"永久住址"|"家中住址"',
  corpus = "imh-zh",
  context_size = 30,
  filter_query = list(book = "游美同學錄")
)

address_clean <- address %>%
  select(DocId, Matched, After) %>%
  mutate(
    # text before the second full stop of the right-hand context, without
    # the full stop it still contains
    address = str_replace_all(str_before_nth(After, "。", 2), "。", "")
  ) %>%
  # keep the full context next to it for later manual cleaning
  rename(address_to_clean = After) %>%
  relocate(address_to_clean, .after = address) %>%
  mutate(address_to_clean = str_remove_all(address_to_clean, "。"))

# The join call is kept as is: its warning is recorded verbatim below.
imh_zh_conc_join <- full_join(imh_zh_conc_join, address_clean, by = "DocId")
## Warning in full_join(imh_zh_conc_join, address_clean, by = "DocId"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 314 of `x` matches multiple rows in `y`.
## ℹ Row 101 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

Ancestry (原籍)

# Concordance on "原籍" with 30 units of context.
ancestry <- histtext::search_concordance_ex(
  '"原籍"',
  corpus = "imh-zh",
  context_size = 30,
  filter_query = list(book = "游美同學錄")
)

# Ancestry is the text that precedes the first full stop after the match.
ancestry_clean <- ancestry %>%
  select(DocId, After) %>%
  mutate(ancestry = str_before_nth(After, "。", 1)) %>%
  select(DocId, ancestry)

imh_zh_conc_join <- imh_zh_conc_join %>% full_join(ancestry_clean, by = "DocId")

Save data

# Export the compiled Chinese table for manual cleaning.
write.csv(imh_zh_conc_join, file = "imh_zh_fulltext_to_clean.csv")

Information extraction (English)

This section explains how we used concordance and regular expressions to retrieve information on the date of arrival and return, the type of funding, the preparation received before going to the United States, marital status, and the address in 1917. This method is based on a close reading of biographies to identify the most common terms used as triggers to retrieve the relevant information.

Arrival

To retrieve the year when the person arrived in America, we used the “search_concordance” function included in the “histtext” R package:

library(histtext)
library(tidyverse)

# Concordance on the two arrival phrases, with 50 units of context.
imh_eng_arrived <- histtext::search_concordance_ex(
  '"arrived in america"| "revisited america"',
  corpus = "imh-en",
  context_size = 50,
  filter_query = list(book = "游美同學錄")
)

imh_eng_arrived %>% head()
arrived_eng <- imh_eng_arrived %>%
  mutate(
    # right-hand context with the trailing run of non-digit characters removed
    after_clean = str_remove(After, "[^0-9]+$"),
    # numbers found in the right-hand context, flattened from a list column
    # into a string
    # note five issues due to page metadata, to correct manually
    arrived_year = as.character(str_extract_numbers(After))
  )

# What remains once the year is removed is taken as the month: strip
# punctuation and surrounding white space, then capitalise.
arrived_eng <- arrived_eng %>%
  mutate(
    arrived_month = after_clean %>%
      str_remove_all(arrived_year) %>%
      str_replace_all("[[:punct:]]", "") %>%
      str_trim() %>%
      str_to_title()
  ) %>%
  # keep only the variables needed for joining
  select(DocId, Title, Matched, arrived_year, arrived_month)

arrived_eng %>% head()

Return

We use the same method for extracting the date of return:

# Concordance on "returned to china" with 200 units of context.
imh_eng_returned <- histtext::search_concordance_ex(
  '"returned to china"',
  corpus = "imh-en",
  context_size = 200,
  filter_query = list(book = "游美同學錄")
)

imh_eng_returned %>% head()

# identify individuals who returned more than once
# 2 individuals returned twice (Id 272, 9)
imh_eng_returned_count <- count(group_by(imh_eng_returned, DocId), sort = TRUE)

imh_eng_returned <- imh_eng_returned %>%
  mutate(
    # right-hand context with the trailing run of non-digit characters removed
    after_clean = str_remove(After, "[^0-9]+$"),
    # numbers found in the right-hand context, flattened into a string
    returned_year = as.character(str_extract_numbers(After))
  )

# What remains once the year is removed is taken as the month: strip
# punctuation and surrounding white space, then capitalise.
imh_eng_returned <- imh_eng_returned %>%
  mutate(
    returned_month = after_clean %>%
      str_remove_all(returned_year) %>%
      str_replace_all("[[:punct:]]", "") %>%
      str_trim() %>%
      str_to_title()
  )

# keep only the variables needed for joining
returned_eng <- select(
  imh_eng_returned,
  DocId, Title, Matched, returned_year, returned_month
)

returned_eng %>% head()

Funding

We also use concordance to retrieve information on funding:

# Concordance on "support" with 50 units of context.
imh_eng_funding <- histtext::search_concordance_ex(
  '"support"',
  corpus = "imh-en",
  context_size = 50,
  filter_query = list(book = "游美同學錄")
)

# The funding qualifier is the tail of the left-hand context. The removals
# run in this order: first sentence, hyphens and paragraph tags, digits, a
# further leading sentence, a leading clause up to a comma, punctuation and
# surrounding white space.
imh_eng_funding <- imh_eng_funding %>%
  mutate(
    funding = Before %>%
      str_replace("[^\\.]+\\.", "") %>%
      str_remove_all("-|<p>|</p>") %>%
      str_replace_all("[:digit:]", "") %>%
      str_replace("[^\\.]+\\.", "") %>%
      str_replace("[^\\,]+\\,", "") %>%
      str_replace_all("[[:punct:]]", "") %>%
      str_trim()
  )

imh_eng_funding %>% head()
# Frequency of each extracted funding qualifier.
count(group_by(imh_eng_funding, funding), sort = TRUE)

# Merge the spelling variants of partial government support into one level.
imh_eng_funding <- mutate(
  imh_eng_funding,
  funding = fct_collapse(
    funding,
    Partial = c("Partial government", "Partialgovernment", "Partial Government"),
    Government = "Government",
    Private = "Private"
  )
)

# Frequencies after collapsing.
count(imh_eng_funding, funding)

# Rebuild the full label: qualifier followed by the lower-cased matched term.
imh_eng_funding <- mutate(
  imh_eng_funding,
  Matched = str_to_lower(Matched),
  Funding = paste(funding, Matched, sep = " ")
)

# keep only the variables needed for joining
funding_eng <- select(imh_eng_funding, DocId, Title, Funding)

funding_eng %>% head()

Preparation

# Step 1: wildcard concordance to list every spelling that starts with
# "prepar", with 150 units of context.
imh_eng_prepar_variants <- histtext::search_concordance_ex(
  "prepar*",
  corpus = "imh-en",
  context_size = 150,
  filter_query = list(book = "游美同學錄")
)

imh_eng_prepar_variants %>% head()
count(group_by(imh_eng_prepar_variants, Matched), sort = TRUE)

# Step 2: search the selected candidate terms (53 matches).
imh_eng_prepared <- histtext::search_concordance_ex(
  '"prepared" | "Prepard" | "from preparatory"',
  corpus = "imh-en",
  context_size = 150,
  filter_query = list(book = "游美同學錄")
)

imh_eng_prepared %>% head()


Extract the name of the preparatory institution:

# Keep the start of the right-hand context up to the first ".", "|", ";"
# or ":" character.
imh_eng_prepa_instit <- mutate(
  imh_eng_prepared,
  preparation = str_extract(After, "^[^\\.|\\;:]+")
)

imh_eng_prepa_instit %>% head()
# Extract the date(s) of preparation and split them into a start and an end.
# The earlier digit-by-digit extraction was overwritten straight away by the
# number extraction below, so only the latter is kept.
imh_eng_prepa_instit <- imh_eng_prepa_instit %>%
  mutate(
    # all numbers found in `preparation`, flattened from a list column into a
    # string; several numbers come out as "c(a, b)"
    prepared_date = as.character(str_extract_numbers(preparation)),
    # strip the "c(" and ")" wrapper left by that conversion
    prepared_date = str_remove_all(prepared_date, "c\\("),
    prepared_date = str_remove_all(prepared_date, "\\)"),
    # start = text before the first comma; end = text after it
    start_prep = str_remove_all(str_extract(prepared_date, "[^,]+,"), ","),
    end_prep = str_remove_all(str_extract(prepared_date, ",.*"), ", "),
    # length of the end value, kept as a check column
    end_year_nchar = nchar(end_prep)
  )

# Institution name: strip boilerplate phrases (including their mis-spaced
# variants), digits and dangling ", -" fragments, in this order.
imh_eng_prepa_instit <- imh_eng_prepa_instit %>%
  mutate(
    prepar_instit = preparation %>%
      str_remove_all("Department, ") %>%
      str_remove_all("for college at ") %>%
      str_remove_all("for collegeat ") %>%
      str_remove_all("for college at") %>%
      str_remove_all("forcollege at") %>%
      str_remove_all("for college in ") %>%
      str_remove_all("[:digit:]") %>%
      str_remove_all(", -") %>%
      str_remove_all(",-") %>%
      str_trim()
  )

# Split at the first comma: the part before it is the institution, the part
# from the comma onwards is the location; punctuation is removed from both.
imh_eng_prepa_instit <- imh_eng_prepa_instit %>%
  mutate(
    prep_instit_clean = str_remove_all(
      str_extract(prepar_instit, "[^,]+,"),
      "[[:punct:]]"
    ),
    prep_location = str_extract(prepar_instit, ",.*") %>%
      str_remove_all("[[:punct:]]") %>%
      str_trim()
  )


# keep only the variables needed for joining and cleaning
prepared_eng <- select(
  imh_eng_prepa_instit,
  DocId, Title, After,
  prepar_instit, prep_instit_clean, prep_location,
  prepared_date, start_prep, end_prep, end_year_nchar
)

prepared_eng %>% head()

Marriage

We also use concordance to retrieve information on marriage:

# Concordance on "married" with 25 units of context.
imh_eng_married <- histtext::search_concordance_ex(
  '"married"',
  corpus = "imh-en",
  context_size = 25,
  filter_query = list(book = "游美同學錄")
)

imh_eng_married %>% head()

# Numbers found after the match, flattened from a list column into a string.
imh_eng_married <- mutate(
  imh_eng_married,
  married_year = as.character(str_extract_numbers(After))
)

# keep only the variables needed for joining
married_eng <- select(imh_eng_married, DocId, Title, married_year)

married_eng %>% head()

Address

# Concordance on "address" with 150 units of context.
imh_eng_address <- histtext::search_concordance_ex(
  '"address"',
  corpus = "imh-en",
  context_size = 150,
  filter_query = list(book = "游美同學錄")
)

# The address is the right-hand context without colons and "- " fragments,
# trimmed, with two spacing corrections applied to the first occurrence.
imh_eng_address <- mutate(
  imh_eng_address,
  address = After %>%
    str_remove_all(":|: |- ") %>%
    str_trim() %>%
    str_replace("c/o", "c/o ") %>%
    str_replace("49Porland", "49 Porland")
)

# keep only the variables needed for joining
address_eng <- select(imh_eng_address, DocId, Title, address)

Compile results

# Start the compiled English table by joining arrival and return records on
# DocId. A DocId can occur several times on either side (see the many-to-many
# warning recorded below), so matching rows are paired in every combination.
# Both inputs carry `Title` and `Matched`, so those columns come out suffixed.
imh_eng_join_to_clean <- full_join(arrived_eng, returned_eng, by = "DocId")
## Warning in full_join(arrived_eng, returned_eng, by = "DocId"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Add marriage, funding, preparation and address data, one join per table,
# always matching on the document id.
imh_eng_join_to_clean <- imh_eng_join_to_clean %>%
  full_join(married_eng, by = "DocId") %>%
  full_join(funding_eng, by = "DocId") %>%
  full_join(prepared_eng, by = "DocId") %>%
  full_join(address_eng, by = "DocId")

Named Entity Recognition (Chinese)

For Named Entity Recognition (NER), we rely on the ner_on_corpus function included in the histtext R package.

Extraction

# load packages

library(histtext)
library(tidyverse)


# Re-run the document search so that this section can be executed on its own.
search_imh_zh <- histtext::search_documents_ex(
  "*",
  corpus = "imh-zh",
  filter_query = list(book = "游美同學錄")
)

# Named entities for every hit, restricted to precomputed annotations.
ner_imh17_zh <- ner_on_corpus(
  search_imh_zh,
  corpus = "imh-zh",
  only_precomputed = TRUE
)
## 1/401
## 11/401
## 21/401
## 31/401
## 41/401
## 51/401
## 61/401
## 71/401
## 81/401
## 91/401
## 101/401
## 111/401
## 121/401
## 131/401
## 141/401
## 151/401
## 161/401
## 171/401
## 181/401
## 191/401
## 201/401
## 211/401
## 221/401
## 231/401
## 241/401
## 251/401
## 261/401
## 271/401
## 281/401
## 291/401
## 301/401
## 311/401
## 321/401
## 331/401
## 341/401
## 351/401
## 361/401
## 371/401
## 381/401
## 391/401
## 401/401
ner_imh17_zh %>% head()

# Number of entities per entity type, most frequent first.
ner_zh_count <- count(group_by(ner_imh17_zh, Type), sort = TRUE)

ner_zh_count

# Keep organisations only.
ner_imh17_zh_org <- filter(ner_imh17_zh, Type == "ORG")

ner_imh17_zh_org %>% head()

Curation

# Build `Text_clean`, a normalised copy of the entity text, next to `Text`.
ner_imh17_zh_org <- ner_imh17_zh_org %>%
  mutate(
    # remove white spaces: str_replace_all() deletes every space, whereas
    # str_replace() would stop after the first one
    Text_clean = str_replace_all(Text, " ", ""),
    # remove Chinese punctuation (full stop)
    Text_clean = str_replace_all(Text_clean, "。", ""),
    # remove non sinograms: paragraph-tag residue, then digits
    Text_clean = str_replace_all(Text_clean, "<p>", ""),
    Text_clean = str_replace_all(Text_clean, "<", ""),
    Text_clean = str_replace_all(Text_clean, "[:digit:]", "")
  ) %>%
  relocate(Text_clean, .after = Text)

# First one and first two characters of each entity, used to spot verbs of
# action attached to the organisation name; both sit just before Text_clean.
ner_imh17_zh_org <- ner_imh17_zh_org %>%
  mutate(
    pref1 = str_sub(Text_clean, 1, 1),
    pref2 = str_sub(Text_clean, 1, 2)
  ) %>%
  relocate(pref1, pref2, .before = Text_clean)

# Frequency tables of the prefixes.
pref1 <- count(group_by(ner_imh17_zh_org, pref1))
pref2 <- count(group_by(ner_imh17_zh_org, pref2))

ner_imh17_zh_org <- ner_imh17_zh_org %>%
  mutate(
    # remove verbs (任|入|伊|於|爲|充)
    # NOTE(review): the pattern is not anchored, so these characters are
    # removed wherever they occur, not only at the start — confirm intended.
    Text_clean = str_remove_all(Text_clean, "任|入|伊|於|爲|充"),
    # count length
    length = nchar(Text_clean)
  )

# Drop a trailing "長", then take the last one and last two characters of
# each entity to classify organisations; both sit right after Text_clean.
ner_imh17_zh_org <- ner_imh17_zh_org %>%
  mutate(
    Text_clean = str_remove_all(Text_clean, "長$"),
    suff1 = str_sub(Text_clean, -1, -1),
    suff2 = str_sub(Text_clean, -2, -1)
  ) %>%
  relocate(suff1, suff2, .after = Text_clean)

# export list of organization for further cleaning in Excel (not run)
# write.csv(ner_imh17_zh_org, "ner_imh17_zh_org.csv")

# Frequency tables of the suffixes, to annotate together with the prefixes
# and create ontologies of organizations and positions.
suff1 <- count(group_by(ner_imh17_zh_org, suff1))
suff2 <- count(group_by(ner_imh17_zh_org, suff2))

# export suffixes/prefixes (not run)
# write.csv(suff1, "ner_suff1.csv")
# write.csv(suff2, "ner_suff2.csv")
# write.csv(pref1, "ner_pref1.csv")
# write.csv(pref2, "ner_pref2.csv")

Named Entity Recognition (English)

For Named Entity Recognition (NER), we rely on the ner_on_corpus function included in the histtext R package.

Extraction

# load packages

library(histtext)
library(tidyverse)


# Re-run the document search so that this section can be executed on its own.
search_imh_en <- histtext::search_documents_ex(
  "*",
  corpus = "imh-en",
  filter_query = list(book = "游美同學錄")
)

# Named entities for every hit, restricted to precomputed annotations.
ner_imh17_en <- ner_on_corpus(
  search_imh_en,
  corpus = "imh-en",
  only_precomputed = TRUE
)
## 1/401
## 11/401
## 21/401
## 31/401
## 41/401
## 51/401
## 61/401
## 71/401
## 81/401
## 91/401
## 101/401
## 111/401
## 121/401
## 131/401
## 141/401
## 151/401
## 161/401
## 171/401
## 181/401
## 191/401
## 201/401
## 211/401
## 221/401
## 231/401
## 241/401
## 251/401
## 261/401
## 271/401
## 281/401
## 291/401
## 301/401
## 311/401
## 321/401
## 331/401
## 341/401
## 351/401
## 361/401
## 371/401
## 381/401
## 391/401
## 401/401
ner_imh17_en %>% head()

# Number of entities per entity type, most frequent first.
ner_en_count <- count(group_by(ner_imh17_en, Type), sort = TRUE)

ner_en_count

# Keep organisations only.
ner_imh17_en_org <- filter(ner_imh17_en, Type == "ORG")

ner_imh17_en_org %>% head()

Curation

# Build `Text_clean`, a curated copy of each organisation name, next to `Text`.
# The replacements are order-dependent and are applied top to bottom;
# str_replace() touches only the first occurrence in each string.
imh17_eng_org <- ner_imh17_en_org %>%
  mutate(
    Text_clean = Text %>%
      # re-join compound names split across a line break ("X- Y" -> "X-Y")
      str_replace("Anglo- Chinese College", "Anglo-Chinese College") %>%
      str_replace("Nanchang- Kiukiang", "Nanchang-Kiukiang") %>%
      str_replace("Nanking- Hunan", "Nanking-Hunan") %>%
      str_replace("Peking- Kalgan", "Peking-Kalgan") %>%
      str_replace("Shanghai- Hangchow", "Shanghai-Hangchow") %>%
      str_replace("Shanghai- Nanking", "Shanghai-Nanking") %>%
      # any other "- " is treated as a word broken by hyphenation
      str_replace("- ", "") %>%
      # expand abbreviations; "20th" is spelled out before digits are removed
      str_replace("Co\\.", "Company") %>%
      str_replace("20th", "Twentieth") %>%
      # strip paragraph tags and digits
      str_replace_all("<p>", "") %>%
      str_replace_all("</p>", "") %>%
      str_replace_all("[:digit:]", "") %>%
      # targeted corrections of individual entities
      str_replace("The-Hua", "Teh-Hua") %>%
      str_replace("Inn,", "Lincoln's Inn") %>%
      str_replace("Yale,", "Yale University") %>%
      str_replace("Yale\\.", "Yale University") %>%
      # drop a leading article
      str_replace("^the ", "") %>%
      str_replace("^The ", "") %>%
      str_replace("and Accounts;", "Railway Finance and Accounts") %>%
      str_replace("Fuh-Tan", "Fu Tan") %>%
      str_replace("Futan", "Fu Tan") %>%
      str_replace("Universities", "University") %>%
      str_replace("Railways", "Railway") %>%
      # NOTE(review): pattern and replacement are identical, so this step is a
      # no-op as written; presumably the pattern was an escaped ampersand
      # before rendering — confirm against the original script.
      str_replace("& Company", "& Company") %>%
      # dots are escaped so that only the literal abbreviation "M.I.T." matches
      str_replace("M\\.I\\.T\\.", "Massachusetts Institute of Technology") %>%
      # remove punctuation and symbols except hyphen, ampersand, apostrophe
      # and full stop, then a trailing full stop
      str_remove_all("[\\p{P}\\p{S}&&[^-&'.]]") %>%
      str_replace("\\.$", "") %>%
      # spelling corrections
      str_replace("Massachussetts", "Massachusetts") %>%
      str_replace("Massachusatts", "Massachusetts") %>%
      str_replace("Simga", "Sigma")
  ) %>%
  relocate(Text_clean, .after = Text)


# Drop entities whose cleaned text is missing, then record the length of the
# cleaned text in characters.
imh17_eng_org_no_na <- imh17_eng_org %>%
  filter(!is.na(Text_clean)) %>%
  mutate(length = nchar(Text_clean))

# export results for further cleaning in Excel (not run)
# write.csv(imh17_eng_org_no_na, "imh17_eng_org.csv")