Load packages
library(histtext)
library(tidyverse)
library(quanteda)
library(strex)
library(hrbrthemes)
Retrieve names of corpus and fields
histtext::list_filter_fields("imh-zh")
## [1] "book" "bookno" "page"
book_imh <- histtext::list_possible_filters("imh-zh", "book")
Retrieve all entries in the Tsinghua directory
search_imh_zh <- histtext::search_documents_ex('*', corpus = "imh-zh", filter_query = list(book = "游美同學錄"))
search_imh_en <- histtext::search_documents_ex('*', corpus = "imh-en", filter_query = list(book = "游美同學錄"))
Convert row names into ID
search_imh_zh <- tibble::rowid_to_column(search_imh_zh, "ID")
search_imh_en <- tibble::rowid_to_column(search_imh_en, "ID")
Retrieve full text
imh17_zh_docs <- get_documents(search_imh_zh, corpus = "imh-zh", batch_size = 10, verbose = FALSE)
imh17_eng_docs <- get_documents(search_imh_en, corpus = "imh-en", batch_size = 10, verbose = FALSE)
Convert row names into ID
imh17_zh_docs <- tibble::rowid_to_column(imh17_zh_docs, "ID")
imh17_eng_docs <- tibble::rowid_to_column(imh17_eng_docs, "ID")
Compute length of biographies
imh17_zh_docs <- imh17_zh_docs %>% select(DocId, Text) %>% mutate(nchar_zh = nchar(Text))
imh17_eng_docs <- imh17_eng_docs %>% select(DocId, Text) %>% mutate(token_eng = ntoken(Text), char_eng = nchar(Text))
Extract page number
imh17_zh_docs <- imh17_zh_docs %>% mutate(page_zh = str_extract(Text, "<p>\\s*(.*?)\\s*</p>")) %>%
mutate(page_zh = str_remove_all(page_zh,"<p>")) %>%
mutate(page_zh = str_remove_all(page_zh,"</p>"))
imh17_eng_docs <- imh17_eng_docs %>% mutate(page_eng = str_extract(Text, "<p>\\s*(.*?)\\s*</p>")) %>%
mutate(page_eng = str_remove_all(page_eng,"<p>")) %>%
mutate(page_eng = str_remove_all(page_eng,"</p>"))
Join two tables
imh17_biling_metadata <- inner_join(imh17_zh_docs, imh17_eng_docs, by = "DocId")
imh17_biling_metadata$Text.x <- NULL
imh17_biling_metadata$Text.y <- NULL
Import and select student attributes
main <- read.csv("~/youmei-new/main.csv")
students_attributes <- main %>% select(DocId, gender, age_in_1917)
Join attributes with metadata
imh_17_doc_attributes <- inner_join(imh17_biling_metadata, students_attributes)
## Joining, by = "DocId"
Visualize correlation between length of text and age
ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=nchar_zh)) +
geom_jitter() +
geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
theme_ipsum() +
labs(title = "Age and length of biographies (Chinese)",
x = "Age",
y = "Number of sinograms",
caption = "遊美同學錄 (1917)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=token_eng)) +
geom_jitter() +
geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
theme_ipsum() +
labs(title = "Age and length of biographies (English)",
x = "Age",
y = "Number of words",
caption = "Who's Who of American Returned Students (1917)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Removed 25 rows containing missing values (`geom_point()`).
Gender and length
ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=nchar_zh, color = gender)) +
geom_jitter() +
geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
theme_ipsum() +
labs(title = "Length of biographies (Chinese)",
x = "Age",
y = "Number of sinograms",
color = "Gender",
caption = "遊美同學錄 (1917)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).
ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=token_eng, color = gender)) +
geom_jitter() +
geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
theme_ipsum() +
labs(title = "Length of biographies (English)",
x = "Age",
y = "Number of words",
color = "Gender",
caption = "Who's Who of American Returned Students (1917)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Removed 25 rows containing missing values (`geom_point()`).
Analyze correlation between length of text and age (linear
regression)
mod1 <- lm(nchar_zh~age_in_1917,data=imh_17_doc_attributes)
summary(mod1)
##
## Call:
## lm(formula = nchar_zh ~ age_in_1917, data = imh_17_doc_attributes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -144.84 -40.71 -6.23 35.47 318.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 175.4768 15.9724 10.986 < 2e-16 ***
## age_in_1917 1.2871 0.4759 2.705 0.00715 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.07 on 374 degrees of freedom
## (25 observations effacées parce que manquantes)
## Multiple R-squared: 0.01918, Adjusted R-squared: 0.01656
## F-statistic: 7.315 on 1 and 374 DF, p-value: 0.00715
mod2 <- lm(token_eng~age_in_1917,data=imh_17_doc_attributes)
summary(mod2)
##
## Call:
## lm(formula = token_eng ~ age_in_1917, data = imh_17_doc_attributes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.739 -35.730 -7.998 27.173 255.249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 120.0451 12.5851 9.539 <2e-16 ***
## age_in_1917 0.5059 0.3750 1.349 0.178
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.48 on 374 degrees of freedom
## (25 observations effacées parce que manquantes)
## Multiple R-squared: 0.004843, Adjusted R-squared: 0.002182
## F-statistic: 1.82 on 1 and 374 DF, p-value: 0.1781
In both languages, the correlation between length and age is
statistically significant (p-value >0.05). The older the individual,
the longer his/her biographical record. English biographies increase in
length by one word for each additional year of age, whereas Chinese
biographies increase in length by 1.35 characters for each additional
year of age.