Abstract
This document explains how to use HistText and other, more generic R packages to retrieve and clean relevant historical information from bilingual biographies, taking the Who's Who of American Returned Students (1917) as a case study.
The Who's Who of American Returned Students (1917) is part of the "imh collection" of who's who directories generously shared by the Institute of Modern History, Academia Sinica, Taipei. The plain texts of the biographies are stored on a Solr server and can be mined using the HistText R package.
Load the required packages:
library(histtext)
library(tidyverse)
To retrieve the full text, we need to find the title of the book
and the names of the target fields:
histtext::list_filter_fields("imh-zh")
## [1] "book" "bookno" "page"
histtext::list_possible_filters("imh-zh", "book")
The book we are interested in is the one titled 游美同學錄
(Youmei tongxuelu). It contains 401 individual entries (biographies). We
can now retrieve all biographies, in Chinese and English:
search_imh_zh <- histtext::search_documents_ex('*', corpus = "imh-zh", filter_query = list(book = "游美同學錄"))
search_imh_en <- histtext::search_documents_ex('*', corpus = "imh-en", filter_query = list(book = "游美同學錄"))
Convert row names into ID
search_imh_zh <- tibble::rowid_to_column(search_imh_zh, "ID")
search_imh_en <- tibble::rowid_to_column(search_imh_en, "ID")
search_imh_zh
search_imh_en
Retrieve full text
imh17_zh_docs <- get_documents(search_imh_zh, corpus = "imh-zh", batch_size = 10, verbose = FALSE)
imh17_eng_docs <- get_documents(search_imh_en, corpus = "imh-en", batch_size = 10, verbose = FALSE)
Convert row names into ID again
imh17_zh_docs <- tibble::rowid_to_column(imh17_zh_docs, "ID")
imh17_eng_docs <- tibble::rowid_to_column(imh17_eng_docs, "ID")
Measure the length of the biographies, based on the number of characters
in Chinese and on the number of words (tokens) in English:
library(quanteda)
imh17_zh_docs <- imh17_zh_docs %>% mutate(length = nchar(Text))
imh17_eng_docs <- imh17_eng_docs %>% mutate(length = ntoken(Text))
imh17_zh_docs
imh17_eng_docs
Save datasets as csv files
write.csv(imh17_zh_docs, "imh17_zh_fulltext.csv")
write.csv(imh17_eng_docs, "imh17_eng_fulltext.csv")
First remove extra white space
imh17_zh_clean <- imh17_zh_docs %>% mutate(text_clean = str_squish(Text))
imh17_zh_clean <- imh17_zh_clean %>% mutate(text_clean = str_replace_all(text_clean, " ", ""))
Extract father’s name
family <- imh17_zh_clean %>% mutate(father_name = str_extract(text_clean, "父\\s*(.*?)\\s*。"))
Extract father’s occupation using the structure of the narrative
(anything before the current address)
family <- family %>% mutate(father_occupation = str_extract(text_clean, "父\\s*(.*?)\\s*(本籍住址|本籍通信處|永久通信處|永久住址|家中住址)")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"本籍住址")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"本籍通信處")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"永久通信處")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"永久住址")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"家中住址")) %>%
mutate(father_occupation = str_remove_all(father_occupation, father_name))
Remove useless information from father’s name/occupation
family <- family %>%
mutate(father_name = str_remove_all(father_name,"。")) %>%
mutate(father_name = str_remove_all(father_name,"父")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"。")) %>%
mutate(father_occupation = str_remove_all(father_occupation,"已婚"))
Extract the uncle's name and use the number of characters to detect anomalies (names with fewer or more than two characters should be discarded; see the sketch after the code below)
family <- family %>% mutate(uncle_name = str_extract(text_clean, "叔\\s*(.*?)\\s*。")) %>%
mutate(uncle_name = str_remove_all(uncle_name,"。")) %>%
mutate(uncle_name = str_remove_all(uncle_name,"叔")) %>%
mutate(uncle_nchar = nchar(uncle_name))
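One way to apply this rule is to blank out anomalous names (a minimal sketch; the two-character threshold restates the rule above):
# discard uncle names that do not have exactly two characters
family <- family %>% mutate(uncle_name = ifelse(uncle_nchar == 2, uncle_name, NA_character_))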
Elder brother (兄)
family <- family %>% mutate(xiong_name = str_extract(text_clean, "兄\\s*(.*?)\\s*。"))
family <- family %>% mutate(xiong_name = str_remove_all(xiong_name,"。")) %>% # remove punctuation
mutate(xiong_name = str_remove_all(xiong_name,"兄")) %>%
mutate(xiong_nchar = nchar(xiong_name)) # count characters to filter out strings with more than 4 characters
Younger brother (弟)
family <- family %>% mutate(di_name = str_extract(text_clean, "弟\\s*(.*?)\\s*。"))
family <- family %>% mutate(di_name = str_remove_all(di_name,"。")) %>% # remove punctuation
mutate(di_name = str_remove_all(di_name,"弟")) %>%
mutate(di_nchar = nchar(di_name)) # count characters to filter out strings with more than 4 characters
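The character counts can now serve to drop implausible strings, following the four-character threshold mentioned in the comments above (a sketch):
# blank out brother names longer than four characters
family <- family %>%
mutate(xiong_name = ifelse(xiong_nchar > 4, NA_character_, xiong_name)) %>%
mutate(di_name = ifelse(di_nchar > 4, NA_character_, di_name))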
We rely on pattern matching to retrieve information on their marital status (已婚 = married, 未婚 = unmarried):
married <- c("已婚", "未婚")
married_vec <- paste(married, sep = "", collapse = "|")
family <- family %>% mutate(married = str_extract(text_clean, married_vec))
We also rely on pattern matching to extract information on the number of children. After a close examination of the data, we found that the maximum number of sons or daughters was 9. On this basis we create a vector of possible cases (ranging from 1 to 9 sons or daughters). We use the characters "子" and "女" as anchors for sons and daughters, as shown below:
sons <- c("子一", "子二", "子三", "子四", "子五", "子六", "子七", "子八", "子九")
son_vec <- paste(sons, sep = "", collapse = "|")
daughter <- c("女一", "女二", "女三", "女四", "女五", "女六", "女七", "女八", "女九")
daughter_vec <- paste(daughter, sep = "", collapse = "|")
family <- family %>% mutate(sons = str_extract(text_clean, son_vec)) %>%
mutate(sons = str_remove_all(sons,"子"))
family <- family %>% mutate(daughters = str_extract(text_clean, daughter_vec)) %>%
mutate(daughters = str_remove_all(daughters,"女"))
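If numeric counts are needed later, the extracted Chinese numerals can be mapped to integers (a sketch; the lookup table is an assumption covering the one-to-nine range observed in the data):
# map Chinese numerals to integers
num_map <- c("一" = 1, "二" = 2, "三" = 3, "四" = 4, "五" = 5,
"六" = 6, "七" = 7, "八" = 8, "九" = 9)
family <- family %>%
mutate(sons_n = unname(num_map[sons]),
daughters_n = unname(num_map[daughters]))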
Inspect last output with all family information:
head(family)
To retrieve information related to the students' source of funding, we again relied on pattern matching. We first closely read a sample of biographies to identify all possible types of funding. Then we create a vector listing the four possible cases:
funding <- c("官費遊美", "公費遊美", "半官費遊美", "自費遊美")
funding_vec <- paste(funding, sep = "", collapse = "|")
family_funding <- family %>% mutate(funding = str_extract(text_clean, funding_vec)) %>%
mutate(funding = str_remove_all(funding,"遊美"))
To retrieve the year when the students returned to China, we used the "search_concordance_ex" function included in the histtext package:
search_imh_zh_conc <- histtext::search_concordance_ex('"回國" | "囘國"',
corpus = "imh-zh", context_size = 15,
filter_query = list(book = "游美同學錄"))
We found that the pattern appears once in 366 biographies and twice in 6
of them (these students went abroad and returned twice), whereas 35
biographies do not contain the pattern (either because the student had
not returned, or because another expression was used):
search_imh_zh_conc %>% group_by(DocId) %>% count(sort = TRUE)
Find out who is missing:
setdiff(family$DocId, search_imh_zh_conc$DocId)
## [1] "imh-11-12" "imh-11-18" "imh-11-24" "imh-11-33" "imh-11-44"
## [6] "imh-11-50" "imh-11-95" "imh-11-99" "imh-11-106" "imh-11-113"
## [11] "imh-11-139" "imh-11-152" "imh-11-163" "imh-11-165" "imh-11-171"
## [16] "imh-11-175" "imh-11-205" "imh-11-208" "imh-11-229" "imh-11-246"
## [21] "imh-11-251" "imh-11-255" "imh-11-256" "imh-11-276" "imh-11-296"
## [26] "imh-11-324" "imh-11-334" "imh-11-335" "imh-11-338" "imh-11-342"
## [31] "imh-11-352" "imh-11-364" "imh-11-371" "imh-11-375" "imh-11-385"
Remove white spaces from the fields "Before" and "After"
imh_zh_conc <- search_imh_zh_conc %>% mutate(before_clean = str_replace_all(Before, " ", "")) %>%
mutate(after_clean = str_replace_all(After, " ", ""))
Clean the field “Before”
imh_zh_conc <- imh_zh_conc %>% mutate(return_date = str_sub(before_clean, - 7, - 1)) %>%
mutate(return_date_clean = str_replace_all(return_date, "年。", "年")) %>%
mutate(return_date_clean = str_remove(return_date_clean,".*。")) %>%
mutate(return_date_clean = str_replace_all(return_date_clean, "</p>", "")) %>%
mutate(return_date_clean = str_replace_all(return_date_clean, "p>", "")) %>%
mutate(return_date_clean = str_replace_all(return_date_clean, "/", "")) %>%
relocate(return_date_clean, .before = Matched)
Extract date patterns for further cleaning. We use a vector of temporal
markers found in the text (the era names 民國, 宣統, and 光緖, and the phrase 是年, "that year"):
zh_date <- c("民國", "宣統", "光緖", "是年")
zh_date_vec <- paste(zh_date, sep = "", collapse = "|")
imh_zh_conc <- imh_zh_conc %>% mutate(date_zh = str_extract(return_date_clean, zh_date_vec)) %>%
relocate(date_zh, .before = return_date_clean)
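As a possible next step (not performed here), the era markers can anchor a conversion to Gregorian years. A sketch, assuming year N of an era equals the era's start year minus one plus N (民國 begins in 1912, 宣統 in 1909, 光緖 in 1875), with a minimal numeral parser for years up to 99:
era_offset <- c("民國" = 1911, "宣統" = 1908, "光緖" = 1874)
zh_num_to_int <- function(num) {
d <- c("元" = 1, "一" = 1, "二" = 2, "三" = 3, "四" = 4,
"五" = 5, "六" = 6, "七" = 7, "八" = 8, "九" = 9)
if (is.na(num)) return(NA_integer_)
if (!str_detect(num, "十")) return(unname(d[num]))
parts <- str_split_fixed(num, "十", 2)
tens <- if (parts[1] == "") 1 else unname(d[parts[1]])
ones <- if (parts[2] == "") 0 else unname(d[parts[2]])
tens * 10 + ones
}
era_offset[["民國"]] + zh_num_to_int("五") # 民國五年 = 1916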
Clean the field “After”
imh_zh_conc <- imh_zh_conc %>% mutate(after_return_clean = str_replace_all(after_clean, "國 。", "")) %>%
mutate(after_return_clean = str_remove(after_return_clean,".。 任*")) %>%
mutate(after_return_clean = str_replace_all(after_return_clean,"。 ", "")) %>%
mutate(after_return_clean = str_replace_all(after_return_clean, " ", "")) %>%
mutate(after_return_clean = str_replace_all(after_return_clean,"。", "")) %>%
mutate(after_return_clean = str_replace_all(after_return_clean,"<", "")) %>%
mutate(after_return_clean = str_replace_all(after_return_clean,"p>", "")) %>%
relocate(after_return_clean, .after = Matched)
Extract date patterns for further cleaning
imh_zh_conc <- imh_zh_conc %>% mutate(after_date_zh = str_extract(after_return_clean, zh_date_vec)) %>%
relocate(after_date_zh, .after = after_return_clean) %>%
mutate(post_return = str_sub(after_return_clean, 1, 1)) %>%
relocate(post_return, .after = after_date_zh)
Select variables for joining with family and funding data
conc_zh_to_join <- imh_zh_conc %>% select(DocId, date_zh, return_date_clean, Matched, after_return_clean, after_date_zh) %>%
rename(return_date = return_date_clean, post_return = after_return_clean)
imh_zh_conc_join <- full_join(family_funding, conc_zh_to_join, by = "DocId")
imh_zh_conc_join
Extract and clean current address (in 1917)
library(strex)
address <- histtext::search_concordance_ex('"本籍住址" | "本籍通信處"|"永久通信處"|"永久住址"|"家中住址"',
corpus = "imh-zh", context_size = 30,
filter_query = list(book = "游美同學錄"))
address_clean <- address %>% select(DocId, Matched, After)
address_clean <- address_clean %>% mutate(address = str_before_nth(After, "。", 2)) %>%
mutate(address = str_replace_all(address,"。", "")) %>%
rename(address_to_clean = After) %>%
relocate(address_to_clean, .after = address) %>%
mutate(address_to_clean = str_remove_all(address_to_clean,"。"))
imh_zh_conc_join <- full_join(imh_zh_conc_join, address_clean, by = "DocId")
## Warning in full_join(imh_zh_conc_join, address_clean, by = "DocId"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 314 of `x` matches multiple rows in `y`.
## ℹ Row 101 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
ancestry <- histtext::search_concordance_ex('"原籍"',
corpus = "imh-zh", context_size = 30,
filter_query = list(book = "游美同學錄"))
ancestry_clean <- ancestry %>% select(DocId, After)
ancestry_clean <- ancestry_clean %>% mutate(ancestry = str_before_nth(After, "。", 1)) %>% select(DocId, ancestry)
imh_zh_conc_join <- full_join(imh_zh_conc_join, ancestry_clean, by = "DocId")
write.csv(imh_zh_conc_join, "imh_zh_fulltext_to_clean.csv")
This section explains how we used concordances and regular expressions to retrieve, from the English-language biographies, information on the dates of arrival and return, the type of funding, the preparation received before going to the United States, marital status, and the address in 1917. This method rests on a close reading of the biographies to identify the most common terms that can serve as triggers for retrieving the relevant information.
To retrieve the year when the person arrived in America, we used the "search_concordance_ex" function included in the "histtext" R package:
library(histtext)
library(tidyverse)
library(strex) # needed below for str_extract_numbers()
# search concordance
imh_eng_arrived <- histtext::search_concordance_ex('"arrived in america"| "revisited america"',
corpus = "imh-en", context_size = 50,
filter_query = list(book = "游美同學錄"))
head(imh_eng_arrived)
# remove everything after year
arrived_eng <- imh_eng_arrived %>% mutate(after_clean = str_remove(After, "[^0-9]+$"))
arrived_eng <- arrived_eng %>% mutate(arrived_year = str_extract_numbers(After)) %>% # extract year
mutate(arrived_year = as.character(arrived_year)) # convert list into string; note five issues due to page metadata, to correct manually
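The problematic rows can be listed before manual correction (a sketch: when page metadata leaks in, str_extract_numbers captures several numbers, which the conversion above stores as a "c(...)" string):
# flag rows where more than one number was captured
arrived_eng %>% filter(str_detect(arrived_year, "c\\(")) %>%
select(DocId, Matched, arrived_year)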
# use extracted year to extract month
arrived_eng <- arrived_eng %>% mutate(arrived_month = str_remove_all(after_clean, arrived_year)) %>%
mutate(arrived_month = str_replace_all(arrived_month, "[[:punct:]]", "")) %>% # remove punctuation
mutate(arrived_month = str_trim(arrived_month)) %>% # remove white space
mutate(arrived_month = str_to_title(arrived_month)) # uppercase month
# discard useless variables before joining
arrived_eng <- arrived_eng %>% select(DocId, Title, Matched, arrived_year, arrived_month)
head(arrived_eng)
We use the same method for extracting the date of return:
# retrieve the year when the student returned to China using the histtext "search_concordance_ex" function
imh_eng_returned <- histtext::search_concordance_ex('"returned to china"',
corpus = "imh-en", context_size = 200,
filter_query = list(book = "游美同學錄"))
head(imh_eng_returned)
# identify individuals who returned more than once
imh_eng_returned_count <- imh_eng_returned %>% group_by(DocId) %>%
count(sort = TRUE) # 2 individuals returned twice (Id 272, 9)
# clean return date
imh_eng_returned <- imh_eng_returned %>% mutate(after_clean = str_remove(After, "[^0-9]+$")) # remove everything after year
imh_eng_returned <- imh_eng_returned %>% mutate(returned_year = str_extract_numbers(After)) %>% # extract year
mutate(returned_year = as.character(returned_year))
# use extracted year to extract month
imh_eng_returned <- imh_eng_returned %>% mutate(returned_month = str_remove_all(after_clean, returned_year)) %>%
mutate(returned_month = str_replace_all(returned_month, "[[:punct:]]", "")) %>% # remove punctuation
mutate(returned_month = str_trim(returned_month)) %>% # remove white space
mutate(returned_month = str_to_title(returned_month)) # uppercase month
# discard useless variables before joining
returned_eng <- imh_eng_returned %>% select(DocId, Title, Matched, returned_year, returned_month)
head(returned_eng)
We also use concordance to retrieve information on funding:
# search the term "support" using concordance
imh_eng_funding <- histtext::search_concordance_ex('"support"',
corpus = "imh-en", context_size = 50,
filter_query = list(book = "游美同學錄"))
# extract everything after "."
imh_eng_funding <- imh_eng_funding %>%
mutate(funding = str_replace(Before,"[^\\.]+\\.","")) %>%
mutate(funding = str_remove_all(funding,"-|<p>|</p>")) %>%
mutate(funding = str_replace_all(funding, "[:digit:]", "")) %>%
mutate(funding=str_replace(funding,"[^\\.]+\\.","")) %>%
mutate(funding=str_replace(funding,"[^\\,]+\\,","")) %>%
mutate(funding = str_replace_all(funding, "[[:punct:]]", "")) %>% # remove punctuation
mutate(funding = str_trim(funding)) # remove white space
head(imh_eng_funding)
# count type of funding source
imh_eng_funding %>% group_by(funding) %>% count(sort = TRUE)
# lump together partial support
imh_eng_funding <- imh_eng_funding %>%
mutate(funding = fct_collapse(funding,
Partial = c("Partial government", "Partialgovernment", "Partial Government"),
Government = "Government",
Private = "Private"))
# count again
imh_eng_funding %>% count(funding)
# reassemble type of support
imh_eng_funding <- imh_eng_funding %>% mutate(Matched = str_to_lower(Matched)) %>%
mutate(Funding = paste(funding, Matched, sep=" "))
# discard useless variables before joining
funding_eng <- imh_eng_funding %>% select(DocId, Title, Funding)
head(funding_eng)
# search the term "prepare" and its variants using concordance and wildcard search
# detect variants
imh_eng_prepar_variants <- histtext::search_concordance_ex('prepar*',
corpus = "imh-en", context_size = 150,
filter_query = list(book = "游美同學錄"))
head(imh_eng_prepar_variants)
imh_eng_prepar_variants %>% group_by(Matched) %>% count(sort = TRUE)
# search the candidate terms (53 matches)
imh_eng_prepared <- histtext::search_concordance_ex('"prepared" | "Prepard" | "from preparatory"',
corpus = "imh-en", context_size = 150,
filter_query = list(book = "游美同學錄"))
head(imh_eng_prepared)
Extract the name of the preparatory institution:
# extract everything before the first full stop, semicolon, or colon
imh_eng_prepa_instit <- imh_eng_prepared %>% mutate(preparation = str_extract(After, "^[^\\.|\\;:]+"))
head(imh_eng_prepa_instit)
# extract date/year of preparation
imh_eng_prepa_instit <- imh_eng_prepared %>% mutate(preparation = str_extract(After, "^[^\\.|\\;:]+")) %>%
mutate(prepared_date = str_extract_numbers(preparation)) %>%
mutate(prepared_date = as.character(prepared_date))
# separate start and end year
imh_eng_prepa_instit <- imh_eng_prepa_instit %>% mutate(prepared_date = str_remove_all(prepared_date,"c\\(")) %>%
mutate(prepared_date = str_remove_all(prepared_date,"\\)")) %>%
mutate(start_prep = str_extract(prepared_date,"[^,]+,"),
end_prep = str_extract(prepared_date,",.*")) %>%
mutate(start_prep = str_remove_all(start_prep,",")) %>%
mutate(end_prep = str_remove_all(end_prep,", ")) %>%
mutate(end_year_nchar = nchar(end_prep))
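The character count makes it possible to restore abbreviated end years (a sketch; it assumes a truncated end year shares the start year's century, and, for a single digit, its decade too):
imh_eng_prepa_instit <- imh_eng_prepa_instit %>%
mutate(end_prep_full = case_when(
end_year_nchar == 2 ~ paste0(str_sub(start_prep, 1, 2), end_prep),
end_year_nchar == 1 ~ paste0(str_sub(start_prep, 1, 3), end_prep),
TRUE ~ end_prep
))
The completed column could then replace end_prep in the selection below.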
# extract name of institution
imh_eng_prepa_instit <- imh_eng_prepa_instit %>% mutate(prepar_instit = str_remove_all(preparation,"Department, ")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit,"for college at ")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit,"for collegeat ")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit,"for college at")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit,"forcollege at")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit,"for college in ")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit, "[:digit:]")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit, ", -")) %>%
mutate(prepar_instit = str_remove_all(prepar_instit, ",-")) %>%
mutate(prepar_instit = str_trim(prepar_instit)) # remove white space
# split at the first comma: institution name before, location after
imh_eng_prepa_instit <- imh_eng_prepa_instit %>%
mutate(prep_instit_clean = str_extract(prepar_instit,"[^,]+,"),
prep_location = str_extract(prepar_instit,",.*")) %>%
mutate(prep_instit_clean = str_remove_all(prep_instit_clean, "[[:punct:]]")) %>%
mutate(prep_location = str_remove_all(prep_location, "[[:punct:]]")) %>%
mutate(prep_location = str_trim(prep_location)) # remove white space
# discard useless variables before joining and cleaning
prepared_eng <- imh_eng_prepa_instit %>% select(DocId, Title, After,
prepar_instit, prep_instit_clean, prep_location,
prepared_date, start_prep, end_prep, end_year_nchar)
head(prepared_eng)
We also use concordance to retrieve information on marriage:
# search the term "married" using concordance
imh_eng_married <- histtext::search_concordance_ex('"married"',
corpus = "imh-en", context_size = 25,
filter_query = list(book = "游美同學錄"))
head(imh_eng_married)
imh_eng_married <- imh_eng_married %>%
mutate(married_year = str_extract_numbers(After)) %>% # extract year
mutate(married_year = as.character(married_year))
# discard useless variables before joining
married_eng <- imh_eng_married %>% select(DocId, Title, married_year)
head(married_eng)
# search the term "address" using concordance
imh_eng_address <- histtext::search_concordance_ex('"address"',
corpus = "imh-en", context_size = 150,
filter_query = list(book = "游美同學錄"))
imh_eng_address <- imh_eng_address %>%
mutate(address = str_remove_all(After,":|: |- ")) %>%
mutate(address = str_trim(address)) %>%
mutate(address = str_replace(address,"c/o","c/o ")) %>%
mutate(address = str_replace(address,"49Porland","49 Porland"))
# discard useless variables before joining
address_eng <- imh_eng_address %>% select(DocId, Title, address)
imh_eng_join_to_clean <- full_join(arrived_eng, returned_eng, by = "DocId")
## Warning in full_join(arrived_eng, returned_eng, by = "DocId"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 4 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
imh_eng_join_to_clean <- full_join(imh_eng_join_to_clean, married_eng, by = "DocId")
imh_eng_join_to_clean <- full_join(imh_eng_join_to_clean, funding_eng, by = "DocId")
imh_eng_join_to_clean <- full_join(imh_eng_join_to_clean, prepared_eng, by = "DocId")
imh_eng_join_to_clean <- full_join(imh_eng_join_to_clean, address_eng, by = "DocId")
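Mirroring the Chinese workflow, the joined table can be saved for further cleaning (the file name below is only a suggestion; not run):
# write.csv(imh_eng_join_to_clean, "imh_eng_fulltext_to_clean.csv")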
For Named Entity Recognition (NER), we rely on the ner_on_corpus function included in the histtext R package.
# load packages
library(histtext)
library(tidyverse)
search_imh_zh <- histtext::search_documents_ex('*', corpus = "imh-zh", filter_query = list(book = "游美同學錄"))
# extract named entities
ner_imh17_zh <- ner_on_corpus(search_imh_zh, corpus = "imh-zh", only_precomputed = TRUE)
## 1/401
## ...
## 401/401
head(ner_imh17_zh)
# count entities types
ner_zh_count <- ner_imh17_zh %>% group_by(Type) %>% count(sort = TRUE)
ner_zh_count
# focus on organizations
ner_imh17_zh_org <- ner_imh17_zh %>% filter(Type == "ORG")
head(ner_imh17_zh_org)
# remove white spaces
ner_imh17_zh_org <- ner_imh17_zh_org %>%
mutate(Text_clean = str_replace_all(Text," ","")) %>%
relocate(Text_clean, .after = Text)
# remove Chinese punctuation
ner_imh17_zh_org <- ner_imh17_zh_org %>%
mutate(Text_clean = str_replace_all(Text_clean, "。", ""))
# remove non-sinograms (HTML tags and digits)
ner_imh17_zh_org <- ner_imh17_zh_org %>%
mutate(Text_clean = str_replace_all(Text_clean, "<p>|</p>", "")) %>%
mutate(Text_clean = str_replace_all(Text_clean, "<", "")) %>%
mutate(Text_clean = str_replace_all(Text_clean, "[:digit:]", ""))
# extract the first one and two characters to identify verbs of action
ner_imh17_zh_org <- ner_imh17_zh_org %>%
mutate(pref1 = str_sub(Text_clean, 1, 1)) %>%
mutate(pref2 = str_sub(Text_clean, 1, 2)) %>%
relocate(pref1, .before = Text_clean) %>%
relocate(pref2, .after = pref1)
pref1 <- ner_imh17_zh_org %>% group_by(pref1) %>% count()
pref2 <- ner_imh17_zh_org %>% group_by(pref2) %>% count()
# remove verbs (任|入|伊|於|爲|充)
ner_imh17_zh_org <- ner_imh17_zh_org %>%
mutate(Text_clean = str_replace_all(Text_clean, "任|入|伊|於|爲|充", ""))
# count length
ner_imh17_zh_org <- ner_imh17_zh_org %>% mutate(length = nchar(Text_clean))
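The length variable can then flag implausibly short strings before the manual cleaning stage (a sketch; the one-character threshold is an assumption):
# single-character "organizations" are most likely extraction noise
ner_imh17_zh_org %>% filter(length <= 1)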
# extract the last one and two characters to classify organizations
ner_imh17_zh_org <- ner_imh17_zh_org %>%
mutate(Text_clean = str_remove_all(Text_clean, "長$")) %>%
mutate(suff1 = str_sub(Text_clean, - 1, - 1)) %>%
mutate(suff2 = str_sub(Text_clean, - 2, - 1)) %>%
relocate(suff1, .after = Text_clean) %>%
relocate(suff2, .after = suff1)
# export list of organization for further cleaning in Excel (not run)
# write.csv(ner_imh17_zh_org, "ner_imh17_zh_org.csv")
# export suffixes/prefixes to annotate and create ontologies of organizations and positions (not run)
suff1 <- ner_imh17_zh_org %>% group_by(suff1) %>% count()
suff2 <- ner_imh17_zh_org %>% group_by(suff2) %>% count()
# write.csv(suff1, "ner_suff1.csv")
# write.csv(suff2, "ner_suff2.csv")
# write.csv(pref1, "ner_pref1.csv")
# write.csv(pref2, "ner_pref2.csv")
We apply the same Named Entity Recognition (NER) workflow to the English-language biographies, again relying on the ner_on_corpus function included in the histtext R package.
# load packages
library(histtext)
library(tidyverse)
search_imh_en <- histtext::search_documents_ex('*', corpus = "imh-en", filter_query = list(book = "游美同學錄"))
# extract named entities
ner_imh17_en <- ner_on_corpus(search_imh_en, corpus = "imh-en", only_precomputed = TRUE)
## 1/401
## ...
## 401/401
head(ner_imh17_en)
# count entities types
ner_en_count <- ner_imh17_en %>% group_by(Type) %>% count(sort = TRUE)
ner_en_count
# focus on organizations
ner_imh17_en_org <- ner_imh17_en %>% filter(Type == "ORG")
head(ner_imh17_en_org)
imh17_eng_org <- ner_imh17_en_org %>%
mutate(Text_clean = str_replace(Text,"Anglo- Chinese College","Anglo-Chinese College")) %>%
relocate(Text_clean, .after = Text) %>%
mutate(Text_clean = str_replace(Text_clean,"Nanchang- Kiukiang","Nanchang-Kiukiang")) %>%
mutate(Text_clean = str_replace(Text_clean,"Nanking- Hunan","Nanking-Hunan")) %>%
mutate(Text_clean = str_replace(Text_clean,"Peking- Kalgan","Peking-Kalgan")) %>%
mutate(Text_clean = str_replace(Text_clean,"Shanghai- Nanking","Shanghai-Nanking")) %>%
mutate(Text_clean = str_replace(Text_clean,"Shanghai- Hangchow","Shanghai-Hangchow"))%>%
mutate(Text_clean = str_replace(Text_clean,"- ","")) %>%
mutate(Text_clean = str_replace(Text_clean,"Co\\.","Company")) %>%
mutate(Text_clean = str_replace(Text_clean,"20th","Twentieth")) %>%
mutate(Text_clean = str_replace_all(Text_clean, "<p>", "")) %>%
mutate(Text_clean = str_replace_all(Text_clean, "</p>", "")) %>%
mutate(Text_clean = str_replace_all(Text_clean, "[:digit:]", "")) %>%
mutate(Text_clean = str_replace(Text_clean,"The-Hua","Teh-Hua")) %>%
mutate(Text_clean = str_replace(Text_clean,"Inn,","Lincoln's Inn")) %>%
mutate(Text_clean = str_replace(Text_clean,"Yale,","Yale University")) %>%
mutate(Text_clean = str_replace(Text_clean,"Yale\\.","Yale University"))%>%
mutate(Text_clean = str_replace(Text_clean, "^the ", "")) %>%
mutate(Text_clean = str_replace(Text_clean, "^The ", "")) %>%
mutate(Text_clean = str_replace(Text_clean,"and Accounts;","Railway Finance and Accounts")) %>%
mutate(Text_clean = str_replace(Text_clean,"Fuh-Tan","Fu Tan")) %>%
mutate(Text_clean = str_replace(Text_clean,"Futan","Fu Tan")) %>%
mutate(Text_clean = str_replace(Text_clean,"Universities","University")) %>%
mutate(Text_clean = str_replace(Text_clean,"Railways","Railway")) %>%
mutate(Text_clean = str_replace(Text_clean,"& Company","& Company")) %>%
mutate(Text_clean = str_replace(Text_clean,"M.I.T\\.","Massachusetts Institute of Technology")) %>%
mutate(Text_clean = str_remove_all(Text_clean, "[\\p{P}\\p{S}&&[^-&'.]]"))%>%
mutate(Text_clean = str_replace(Text_clean, "\\.$", "")) %>%
mutate(Text_clean = str_replace(Text_clean,"Massachussetts","Massachusetts")) %>%
mutate(Text_clean = str_replace(Text_clean,"Massachusatts","Massachusetts")) %>%
mutate(Text_clean = str_replace(Text_clean,"Simga","Sigma"))
# remove NA
imh17_eng_org_no_na <- imh17_eng_org %>% filter(!is.na(Text_clean))
# count length
imh17_eng_org_no_na <- imh17_eng_org_no_na %>% mutate(length = nchar(Text_clean))
# export results for further cleaning in Excel (not run)
# write.csv(imh17_eng_org_no_na, "imh17_eng_org.csv")