Load packages

library(histtext)
library(tidyverse)
library(quanteda)
library(strex)
library(hrbrthemes)


Retrieve names of corpus and fields

histtext::list_filter_fields("imh-zh")
## [1] "book"   "bookno" "page"
book_imh <- histtext::list_possible_filters("imh-zh", "book")


Retrieve all entries in the Tsinghua directory

search_imh_zh <- histtext::search_documents_ex('*', corpus = "imh-zh", filter_query = list(book = "游美同學錄"))
search_imh_en <- histtext::search_documents_ex('*', corpus = "imh-en", filter_query = list(book = "游美同學錄"))


Convert row names into ID

search_imh_zh <- tibble::rowid_to_column(search_imh_zh, "ID")
search_imh_en <- tibble::rowid_to_column(search_imh_en, "ID")


Retrieve full text

imh17_zh_docs <- get_documents(search_imh_zh, corpus = "imh-zh", batch_size = 10, verbose = FALSE)
imh17_eng_docs <- get_documents(search_imh_en, corpus = "imh-en", batch_size = 10, verbose = FALSE)


Convert row names into ID

imh17_zh_docs <- tibble::rowid_to_column(imh17_zh_docs, "ID")
imh17_eng_docs <- tibble::rowid_to_column(imh17_eng_docs, "ID")


Compute length of biographies

imh17_zh_docs <- imh17_zh_docs %>% select(DocId, Text) %>% mutate(nchar_zh = nchar(Text))
imh17_eng_docs <- imh17_eng_docs %>% select(DocId, Text) %>% mutate(token_eng = ntoken(Text), char_eng = nchar(Text))


Extract page number

imh17_zh_docs <- imh17_zh_docs %>% mutate(page_zh = str_extract(Text, "<p>\\s*(.*?)\\s*</p>")) %>% 
  mutate(page_zh = str_remove_all(page_zh,"<p>")) %>%
  mutate(page_zh = str_remove_all(page_zh,"</p>"))

imh17_eng_docs <- imh17_eng_docs %>% mutate(page_eng = str_extract(Text, "<p>\\s*(.*?)\\s*</p>")) %>% 
  mutate(page_eng = str_remove_all(page_eng,"<p>")) %>%
  mutate(page_eng = str_remove_all(page_eng,"</p>"))


Join two tables

imh17_biling_metadata <- inner_join(imh17_zh_docs, imh17_eng_docs, by = "DocId")
imh17_biling_metadata$Text.x <- NULL
imh17_biling_metadata$Text.y <- NULL


Import and select student attributes

main <- read.csv("~/youmei-new/main.csv")
students_attributes <- main %>% select(DocId, gender, age_in_1917)


Join attributes with metadata

imh_17_doc_attributes <- inner_join(imh17_biling_metadata, students_attributes)
## Joining, by = "DocId"


Visualize correlation between length of text and age

ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=nchar_zh)) +
  geom_jitter() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_ipsum() + 
  labs(title = "Age and length of biographies (Chinese)",
       x = "Age", 
       y = "Number of sinograms",
       caption = "遊美同學錄 (1917)") 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).

ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=token_eng)) +
  geom_jitter() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_ipsum() + 
  labs(title = "Age and length of biographies (English)",
       x = "Age", 
       y = "Number of words",
       caption = "Who's Who of American Returned Students (1917)") 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Removed 25 rows containing missing values (`geom_point()`).


Gender and length

ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=nchar_zh, color = gender)) +
  geom_jitter() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_ipsum() + 
  labs(title = "Length of biographies (Chinese)",
       x = "Age", 
       y = "Number of sinograms",
       color = "Gender",
       caption = "遊美同學錄 (1917)") 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 25 rows containing missing values (`geom_point()`).

ggplot(imh_17_doc_attributes, aes(x=age_in_1917, y=token_eng, color = gender)) +
  geom_jitter() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  theme_ipsum() + 
  labs(title = "Length of biographies (English)",
       x = "Age", 
       y = "Number of words",
       color = "Gender",
       caption = "Who's Who of American Returned Students (1917)") 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (`stat_smooth()`).
## Removed 25 rows containing missing values (`geom_point()`).


Analyze correlation between length of text and age (linear regression)

mod1 <- lm(nchar_zh~age_in_1917,data=imh_17_doc_attributes)
summary(mod1)
## 
## Call:
## lm(formula = nchar_zh ~ age_in_1917, data = imh_17_doc_attributes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -144.84  -40.71   -6.23   35.47  318.47 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 175.4768    15.9724  10.986  < 2e-16 ***
## age_in_1917   1.2871     0.4759   2.705  0.00715 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.07 on 374 degrees of freedom
##   (25 observations effacées parce que manquantes)
## Multiple R-squared:  0.01918,    Adjusted R-squared:  0.01656 
## F-statistic: 7.315 on 1 and 374 DF,  p-value: 0.00715
mod2 <- lm(token_eng~age_in_1917,data=imh_17_doc_attributes)
summary(mod2)
## 
## Call:
## lm(formula = token_eng ~ age_in_1917, data = imh_17_doc_attributes)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -101.739  -35.730   -7.998   27.173  255.249 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 120.0451    12.5851   9.539   <2e-16 ***
## age_in_1917   0.5059     0.3750   1.349    0.178    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.48 on 374 degrees of freedom
##   (25 observations effacées parce que manquantes)
## Multiple R-squared:  0.004843,   Adjusted R-squared:  0.002182 
## F-statistic:  1.82 on 1 and 374 DF,  p-value: 0.1781


In both languages, the correlation between length and age is statistically significant (p-value >0.05). The older the individual, the longer his/her biographical record. English biographies increase in length by one word for each additional year of age, whereas Chinese biographies increase in length by 1.35 characters for each additional year of age.