Abstract
This document aims to provide a synthetic view of the press in early 1930s China through a multiple correspondence analysis (MCA) of the data extracted from Crow’s Newspaper Directory of China (1931-5).
For this experiment, we will rely on the complete dataset containing the 1060 periodicals listed in the two directories. In the preprocessing step, we re-factorize certain variables to enable multiple correspondence analysis. We lump together and recode the variables with rare values. In addition, we transform the continuous variables (e.g. circulation) into categorical ones. Finally, we attribute a unique id for each observation (periodical) and read the first column as row names. Since we will perform one MCA for each directory, we will split the original dataset into two samples, one for each directory.
Load original dataset containing 1060 periodicals and 47 variables:
library(readr)
crowdata <- read_delim("crowdata.csv", ";",
escape_double = FALSE, trim_ws = TRUE)
Select relevant variables for MCA:
crowdata <- crowdata %>% mutate(Title = paste(Title_eng, Title_zh, sep = " ")) %>%
mutate(Title = str_remove_all(Title,", The NA")) %>%
mutate(Title = str_remove_all(Title,", The")) %>%
mutate(Title = str_remove_all(Title," NA")) %>%
mutate(Title = str_remove_all(Title,", L'"))
crow_mca <- crowdata %>% select(Title, Year,
City_zh, Established,
Periodicity, Language,
Page_nbr_avrg, Page_surface, Col_nbr_avrg,
Circulation_avrg, Audited,
Publisher_type, Nationality)
names(crow_mca)
## [1] "Title" "Year" "City_zh" "Established"
## [5] "Periodicity" "Language" "Page_nbr_avrg" "Page_surface"
## [9] "Col_nbr_avrg" "Circulation_avrg" "Audited" "Publisher_type"
## [13] "Nationality"
We re-factorize the place of publication (city) according to the profile of the city and its geographical situation in China (the classification is described in detail in this document:
# load and prepare annotated data for joining
library(readr)
city_code <- read_delim("city_code.csv",
";", escape_double = FALSE, trim_ws = TRUE)
code <- city_code %>% select(City_zh, Profile, Location)
code <- code %>% mutate(code = paste0(Location, Profile))
place <- code %>% select(City_zh, Location, Profile)
# join
crow_mca <- right_join(crow_mca, place)
We re-factorize the year of establishment into four main periods (the periodization is described in the first essay):
crow_mca <- crow_mca %>% mutate(Founded=cut(Established,
breaks=c(0, 1903, 1916, 1927, 1935),
include.lowest = TRUE))
crow_mca <- crow_mca %>% mutate(Founded = fct_recode(Founded,
"1829-1903" = "[0,1.9e+03]",
"1904-1916" = "(1.9e+03,1.92e+03]",
"1917-1927" = "(1.92e+03,1.93e+03]",
"1928-1935" = "(1.93e+03,1.94e+03]"))
We re-factorize the periodicity:
crow_mca <- crow_mca %>%
mutate(Periodicity_fct = fct_collapse(Periodicity,
AQSM = c("Annual", "Quarterly", "Semi-monthly"),
BiWM = c("Biweekly", "Semi-weekly", "Bimonthly"),
Daily = c("Daily", "Tabloid"),
Weekly = c("Weekly"),
Monthly = c("Monthly")))
We re-factorize the language of publication:
crow_mca <- crow_mca %>%
mutate(Language_fct = fct_collapse(Language,
Other = c("German", "French", "Russian", "Bilingual"),
English = c("English"),
Japanese = c("Japanese"),
Chinese = c("Chinese")))
We transform the circulation figures into a categorical variable with 6 thresholds:
crow_mca <- crow_mca %>% mutate(Circulation_thres=cut(Circulation_avrg,
breaks=c(0, 500, 1000, 2500, 5000, 10000, 50000, 150000),
include.lowest = TRUE))
We re-factorize the publisher’s profile and nationality (the classification is described in this document):
crow_mca <- crow_mca %>%
mutate(Publisher_fct = fct_collapse(Publisher_type,
Other = c("Army", "Church", "Hospital"),
Newsgroup = c("Newsgroup"),
Newspaper = c("Newspaper"),
"Publishing house" = c("Publishing house"),
Organization = c("Association", "Business organization"),
Private = c("Company", "Person"),
Official = c("Administration", "Political party"),
Academic = c("University")))
crow_mca <- crow_mca %>%
mutate(Nationality_fct = fct_collapse(Nationality,
Other = c("Foreign", "Japanese", "German", "Russian", "Irish"),
American = c("American", "Chinese-American"),
British = c("British"),
Chinese = c("Chinese")))
We transform the continuous variables related to the format and layout of periodicals into categorical ones (these variables are described in detail this document):
# number of pages
crow_mca <- crow_mca %>% mutate(Page_nbr_thres=cut(Page_nbr_avrg,
breaks=c(0, 10, 25, 50, 100, 500, 1200),
include.lowest = TRUE))
# Page size
crow_mca <- crow_mca %>%
mutate(PageSize=cut(Page_surface, breaks=c(0, 100, 200, 400),
labels=c("Octavo","Compact","Broadsheet")))
# Layout
crow_mca <- crow_mca %>% mutate(Col_nbr_thres=cut(Col_nbr_avrg,
breaks=c(0, 3, 5, 8, 10, 13),
include.lowest = TRUE))
Finally, we re-select and rename the relevant variables to create a clean dataset:
crow_mca <- crow_mca %>% select(Title, Year, Periodicity_fct, Language_fct, Location, Profile, Founded, Publisher_fct, Nationality_fct, Audited, Circulation_thres, Page_nbr_thres, PageSize, Col_nbr_thres)
crow_mca <- crow_mca %>% rename(Periodicity = Periodicity_fct, Language = Language_fct,
Province = Location, City = Profile,
Publisher = Publisher_fct, Nationality = Nationality_fct,
Circulation = Circulation_thres,
PageNbr = Page_nbr_thres,
ColNbr = Col_nbr_thres)
We now have the complete dataset with all periodicals including those with missing data.
We create a unique id for each observation (periodical):
crow_mca_id <- rowid_to_column(crow_mca)
crow_mca_id <- crow_mca_id %>% mutate(id = paste(rowid, Title, sep = "."))
crow_mca_id <- crow_mca_id %>% select(id, Year, Periodicity, Language, Province, City, Founded, Publisher, Nationality, Audited, Circulation, PageNbr, PageSize, ColNbr)
We create one subset for each directory:
crow_mca1931 <- crow_mca %>% filter(Year== "1931")
crow_mca1935 <- crow_mca %>% filter(Year== "1935")
crow_mca_id1931 <- crow_mca_id %>% filter(Year== "1931") # 360
crow_mca_id1935 <- crow_mca_id %>% filter(Year== "1935") # 703
We read the first column as row names:
#1931
crow_mca_id1931_tbl <- column_to_rownames(crow_mca_id1931, var = "id")
crowmca31 <- crow_mca_id1931_tbl %>% select(Periodicity, Language, Province,
Founded, Publisher, Nationality, Audited, Circulation, PageNbr, PageSize, ColNbr)
#1935
crow_mca_id1935_tbl <- column_to_rownames(crow_mca_id1935, var = "id")
crowmca35 <- crow_mca_id1935_tbl %>% select(Periodicity, Language, Province,
Founded, Publisher, Nationality, Audited, Circulation, PageNbr, PageSize, ColNbr)
We’re all set!
Load packages
library(FactoMineR)
library(Factoshiny)
library(explor)
We first perform a MCA on the dataset from the 1931 directory. The dataset contains 360 periodicals and 11 variables. We transform all variables into factor variables as required by MCA. We consider all variables as active variables, except for the place of publication (Province) and the period of founding (Founded), which we set as supplementary variables.
crowmca31fct <- crowmca31 %>% mutate_all(factor)
res_acm31 <- MCA(crowmca31fct, quali.sup = 3:4)
res_acm31
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 360 individuals, described by 11 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$quali.sup" "results for the supplementary categorical variables"
## 12 "$quali.sup$coord" "coord. for the supplementary categories"
## 13 "$quali.sup$cos2" "cos2 for the supplementary categories"
## 14 "$quali.sup$v.test" "v-test for the supplementary categories"
## 15 "$call" "intermediate results"
## 16 "$call$marge.col" "weights of columns"
## 17 "$call$marge.li" "weights of rows"
Dimensions description
variances31 <- as.data.frame(res_acm31$eig) %>%
rownames_to_column() %>% # retrieve row names (dim 1, dim 2, etc) in a distinct column
slice(1:10) %>% # retain information on the 10 first dimensions only
mutate(Axis = str_replace_all(rowname, "dim", "Axis")) %>% # create a new variable from rowname, with values "Axis 1, Axis 2, etc. instead of "dim 1, dim 2.." etc)
select(-rowname) %>% # remove this column since we don't need it anymore
mutate(Axis = fct_relevel(Axis, paste("Axis", 1:10))) %>% # to ensure the dimensions are ordered correctly on the graphs
select(Axis, `eigenvalue`, `percentage of variance`, `cumulative percentage of variance`)
variances31 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 2) %>%
autofit() %>%
set_caption("Eigenvalue & variances along axes")
Axis | eigenvalue | percentage of variance | cumulative percentage of variance |
Axis 1 | 0,56 | 11,92 | 11,92 |
Axis 2 | 0,34 | 7,23 | 19,15 |
Axis 3 | 0,29 | 6,11 | 25,26 |
Axis 4 | 0,25 | 5,45 | 30,71 |
Axis 5 | 0,20 | 4,25 | 34,96 |
Axis 6 | 0,19 | 4,01 | 38,97 |
Axis 7 | 0,17 | 3,73 | 42,70 |
Axis 8 | 0,17 | 3,66 | 46,36 |
Axis 9 | 0,16 | 3,36 | 49,72 |
Axis 10 | 0,14 | 3,10 | 52,82 |
# pour exporter ce tableau, plusieurs solutions possibles, notamment :
write_csv2(variances31, "variances31.csv")
Variance
ggplot(variances31, aes(x = Axis)) +
geom_bar(aes(y = `percentage of variance`),
stat = "identity",
fill = "red") +
xlab("") +
ylab("% of variance") +
theme_minimal()
Cumulative variance
ggplot(variances31, aes(x = Axis)) +
geom_bar(aes(y = `cumulative percentage of variance`),
stat = "identity",
fill = "blue") +
xlab("") +
ylab("cumulative percentage of variance") +
theme_minimal()
Contribution threshold
threshold31 <- 100 / nrow(res_acm31$var$coord)
Variables frequency
frequences31 <- crowmca31fct %>%
pivot_longer(everything(),
names_to = "variables",
values_to = "categories") %>% # count all occurrences of variable/category
count(variables, categories) %>% # count only unique pairs of variable/category (i.e. number of unique individuals for each category)
group_by(variables) %>%
mutate(percentage = round(100 * n / nrow(crowmca31fct), 1)) %>% # compute percentage for each group of variables
ungroup() %>%
select(variables, categories, n, percentage) # re-order variables
frequences31 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 1) %>%
autofit() %>%
set_caption("Variables Frequency on Crow 1931 dataset")
variables | categories | n | percentage |
Audited | Audited | 8 | 2,2 |
Audited | Publisher's statement | 296 | 82,2 |
Audited | 56 | 15,6 | |
Circulation | [0,500] | 23 | 6,4 |
Circulation | (500,1e+03] | 40 | 11,1 |
Circulation | (1e+03,2.5e+03] | 74 | 20,6 |
Circulation | (2.5e+03,5e+03] | 83 | 23,1 |
Circulation | (5e+03,1e+04] | 47 | 13,1 |
Circulation | (1e+04,5e+04] | 34 | 9,4 |
Circulation | (5e+04,1.5e+05] | 3 | 0,8 |
Circulation | 56 | 15,6 | |
ColNbr | [0,3] | 20 | 5,6 |
ColNbr | (3,5] | 33 | 9,2 |
ColNbr | (5,8] | 150 | 41,7 |
ColNbr | (8,10] | 75 | 20,8 |
ColNbr | (10,13] | 19 | 5,3 |
ColNbr | 63 | 17,5 | |
Founded | 1829-1903 | 17 | 4,7 |
Founded | 1904-1916 | 45 | 12,5 |
Founded | 1917-1927 | 117 | 32,5 |
Founded | 1928-1935 | 148 | 41,1 |
Founded | 33 | 9,2 | |
Language | Other | 10 | 2,8 |
Language | Chinese | 295 | 81,9 |
Language | English | 49 | 13,6 |
Language | Japanese | 6 | 1,7 |
Nationality | Other | 9 | 2,5 |
Nationality | Chinese | 62 | 17,2 |
Nationality | American | 11 | 3,1 |
Nationality | British | 26 | 7,2 |
Nationality | 252 | 70,0 | |
PageNbr | [0,10] | 215 | 59,7 |
PageNbr | (10,25] | 66 | 18,3 |
PageNbr | (25,50] | 29 | 8,1 |
PageNbr | (50,100] | 26 | 7,2 |
PageNbr | (100,500] | 17 | 4,7 |
PageNbr | (500,1.2e+03] | 4 | 1,1 |
PageNbr | 3 | 0,8 | |
PageSize | Octavo | 79 | 21,9 |
PageSize | Compact | 72 | 20,0 |
PageSize | Broadsheet | 205 | 56,9 |
PageSize | 4 | 1,1 | |
Periodicity | AQSM | 11 | 3,1 |
Periodicity | BiWM | 5 | 1,4 |
Periodicity | Daily | 242 | 67,2 |
Periodicity | Monthly | 46 | 12,8 |
Periodicity | Weekly | 56 | 15,6 |
Province | CT | 10 | 2,8 |
Province | NNE | 94 | 26,1 |
Province | NW | 2 | 0,6 |
Province | SE | 225 | 62,5 |
Province | SW | 29 | 8,1 |
Publisher | Other | 3 | 0,8 |
Publisher | Official | 5 | 1,4 |
Publisher | Organization | 20 | 5,6 |
Publisher | Private | 17 | 4,7 |
Publisher | Newsgroup | 20 | 5,6 |
Publisher | Newspaper | 14 | 3,9 |
Publisher | Publishing house | 24 | 6,7 |
Publisher | Academic | 5 | 1,4 |
Publisher | 252 | 70,0 |
write_csv2(frequences31, "frequences31.csv")
** Active variables**
# Coordinates
coordinates31 <- as_tibble(res_acm31$var$coord,
rownames = "categories") %>% # retrieve coordinates
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case characters
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "coord", sep = "_")) # add suffix _coord to variable name
# Contributions
contributions31 <- as_tibble(res_acm31$var$contrib,
rownames = "categories") %>% # retrieve contributions
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case characters
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "contrib", sep = "_")) # add suffix _contrib to variable name
# Square cosine
cos2_31 <- as_tibble(res_acm31$var$cos2,
rownames = "categories") %>% # retrieve cos2
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "cos2", sep = "_")) # add suffix _cos2
# vtest
vtest31 <- as_tibble(res_acm31$var$v.test,
rownames = "categories") %>% # retrieve v.test values
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "vtest", sep = "_")) # add suffix _vtest
# Re-assemble results
results_active31 <- frequences31 %>%
right_join(coordinates31) %>%
right_join(contributions31) %>%
right_join(cos2_31) %>%
right_join(vtest31) %>% # merge data using "categories"
mutate(type = "Active") %>% # add column to specify the type of variable (active)
select(type, variables, categories, n, percentage,
contains("dim1"), contains("dim2"),
contains("dim3"), contains("dim4")) # reorder variables according to their relevance for each axis
results_active31 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 1) %>%
autofit() %>%
set_caption("Active Variables: Statistical Results")
type | variables | categories | n | percentage | dim1_coord | dim1_contrib | dim1_cos2 | dim1_vtest | dim2_coord | dim2_contrib | dim2_cos2 | dim2_vtest | dim3_coord | dim3_contrib | dim3_cos2 | dim3_vtest | dim4_coord | dim4_contrib | dim4_cos2 | dim4_vtest |
Active | Audited | Audited | 8 | 2,2 | 0,9 | 0,4 | 0,0 | 2,6 | 1,9 | 2,5 | 0,1 | 5,3 | -2,0 | 3,3 | 0,1 | -5,6 | 1,8 | 3,1 | 0,1 | 5,1 |
Active | Audited | Publisher's statement | 296 | 82,2 | 0,0 | 0,0 | 0,0 | 1,8 | -0,2 | 1,6 | 0,3 | -9,9 | -0,2 | 1,3 | 0,2 | -8,2 | -0,0 | 0,1 | 0,0 | -1,5 |
Active | Circulation | [0,500] | 23 | 6,4 | 0,6 | 0,5 | 0,0 | 3,0 | 0,2 | 0,1 | 0,0 | 0,8 | -0,9 | 1,9 | 0,1 | -4,3 | -1,2 | 3,8 | 0,1 | -5,7 |
Active | Circulation | (500,1e+03] | 40 | 11,1 | -0,0 | 0,0 | 0,0 | -0,2 | -0,3 | 0,3 | 0,0 | -1,9 | -0,4 | 0,6 | 0,0 | -2,6 | -0,5 | 1,1 | 0,0 | -3,1 |
Active | Circulation | (1e+03,2.5e+03] | 74 | 20,6 | 0,0 | 0,0 | 0,0 | 0,1 | 0,0 | 0,0 | 0,0 | 0,3 | -0,3 | 0,8 | 0,0 | -3,1 | 0,1 | 0,0 | 0,0 | 0,7 |
Active | Circulation | (2.5e+03,5e+03] | 83 | 23,1 | 0,1 | 0,0 | 0,0 | 0,6 | -0,3 | 0,5 | 0,0 | -2,7 | -0,1 | 0,1 | 0,0 | -0,9 | 0,0 | 0,0 | 0,0 | 0,2 |
Active | Circulation | (5e+03,1e+04] | 47 | 13,1 | -0,1 | 0,0 | 0,0 | -0,4 | -0,1 | 0,1 | 0,0 | -0,9 | -0,2 | 0,3 | 0,0 | -1,9 | 0,3 | 0,6 | 0,0 | 2,4 |
Active | Circulation | (1e+04,5e+04] | 34 | 9,4 | 0,1 | 0,0 | 0,0 | 0,7 | -0,7 | 1,6 | 0,1 | -4,4 | 0,2 | 0,2 | 0,0 | 1,3 | 0,7 | 1,9 | 0,1 | 4,1 |
Active | Circulation | (5e+04,1.5e+05] | 3 | 0,8 | 0,2 | 0,0 | 0,0 | 0,4 | 0,3 | 0,0 | 0,0 | 0,6 | -1,4 | 0,7 | 0,0 | -2,5 | 1,2 | 0,5 | 0,0 | 2,1 |
Active | ColNbr | [0,3] | 20 | 5,6 | 1,4 | 2,3 | 0,1 | 6,6 | 1,2 | 2,5 | 0,1 | 5,3 | -1,4 | 4,5 | 0,1 | -6,6 | -0,4 | 0,3 | 0,0 | -1,7 |
Active | ColNbr | (3,5] | 33 | 9,2 | -0,3 | 0,2 | 0,0 | -2,0 | 0,1 | 0,1 | 0,0 | 0,9 | -0,4 | 0,7 | 0,0 | -2,6 | -0,5 | 0,9 | 0,0 | -2,9 |
Active | ColNbr | (5,8] | 150 | 41,7 | -0,4 | 1,6 | 0,1 | -7,1 | 0,2 | 0,4 | 0,0 | 2,7 | -0,2 | 0,8 | 0,0 | -3,6 | -0,1 | 0,2 | 0,0 | -1,7 |
Active | ColNbr | (8,10] | 75 | 20,8 | -0,7 | 1,8 | 0,1 | -6,3 | -0,2 | 0,3 | 0,0 | -2,1 | 0,3 | 0,7 | 0,0 | 2,7 | 0,1 | 0,2 | 0,0 | 1,3 |
Active | ColNbr | (10,13] | 19 | 5,3 | -0,6 | 0,3 | 0,0 | -2,5 | 0,3 | 0,1 | 0,0 | 1,2 | 0,9 | 1,6 | 0,0 | 4,0 | 2,0 | 9,6 | 0,2 | 9,1 |
Active | PageNbr | [0,10] | 215 | 59,7 | -0,6 | 3,8 | 0,5 | -12,9 | -0,1 | 0,3 | 0,0 | -3,0 | -0,1 | 0,2 | 0,0 | -2,1 | -0,3 | 2,4 | 0,1 | -7,0 |
Active | PageNbr | (10,25] | 66 | 18,3 | -0,2 | 0,2 | 0,0 | -1,9 | 0,5 | 1,6 | 0,1 | 4,7 | 0,1 | 0,0 | 0,0 | 0,7 | 1,1 | 10,5 | 0,3 | 10,3 |
Active | PageNbr | (25,50] | 29 | 8,1 | 1,6 | 3,9 | 0,2 | 8,7 | 0,7 | 1,4 | 0,0 | 4,0 | -1,3 | 5,5 | 0,1 | -7,4 | 0,3 | 0,3 | 0,0 | 1,8 |
Active | PageNbr | (50,100] | 26 | 7,2 | 1,8 | 4,6 | 0,2 | 9,4 | -0,5 | 0,6 | 0,0 | -2,5 | 0,7 | 1,3 | 0,0 | 3,6 | -0,5 | 0,7 | 0,0 | -2,5 |
Active | PageNbr | (100,500] | 17 | 4,7 | 1,9 | 3,5 | 0,2 | 8,1 | -1,7 | 4,7 | 0,1 | -7,3 | 1,0 | 1,7 | 0,1 | 4,0 | 0,3 | 0,2 | 0,0 | 1,4 |
Active | PageNbr | (500,1.2e+03] | 4 | 1,1 | 1,8 | 0,7 | 0,0 | 3,5 | 0,0 | 0,0 | 0,0 | 0,0 | 0,9 | 0,4 | 0,0 | 1,9 | -0,3 | 0,0 | 0,0 | -0,6 |
Active | PageSize | Octavo | 79 | 21,9 | 1,7 | 12,4 | 0,8 | 16,9 | -0,4 | 1,0 | 0,0 | -3,8 | 0,1 | 0,0 | 0,0 | 0,7 | -0,1 | 0,1 | 0,0 | -0,7 |
Active | PageSize | Compact | 72 | 20,0 | -0,4 | 0,6 | 0,0 | -3,6 | 0,1 | 0,0 | 0,0 | 0,5 | -0,6 | 2,7 | 0,1 | -5,6 | -0,8 | 4,9 | 0,1 | -7,1 |
Active | PageSize | Broadsheet | 205 | 56,9 | -0,5 | 3,2 | 0,4 | -11,6 | 0,1 | 0,1 | 0,0 | 1,1 | 0,1 | 0,2 | 0,0 | 2,2 | 0,3 | 3,1 | 0,2 | 7,7 |
Active | Periodicity | AQSM | 11 | 3,1 | 2,1 | 2,7 | 0,1 | 7,1 | 1,8 | 3,4 | 0,1 | 6,2 | 2,3 | 6,2 | 0,2 | 7,7 | -1,4 | 2,6 | 0,1 | -4,7 |
Active | Periodicity | BiWM | 5 | 1,4 | -0,1 | 0,0 | 0,0 | -0,3 | -0,8 | 0,3 | 0,0 | -1,8 | -0,2 | 0,0 | 0,0 | -0,4 | -0,7 | 0,3 | 0,0 | -1,6 |
Active | Periodicity | Daily | 242 | 67,2 | -0,6 | 4,0 | 0,6 | -14,9 | 0,0 | 0,0 | 0,0 | 0,8 | 0,1 | 0,2 | 0,0 | 2,2 | 0,2 | 1,3 | 0,1 | 5,7 |
Active | Periodicity | Monthly | 46 | 12,8 | 1,8 | 8,4 | 0,5 | 13,2 | -1,1 | 5,0 | 0,2 | -7,9 | 0,4 | 0,7 | 0,0 | 2,7 | 0,2 | 0,2 | 0,0 | 1,2 |
Active | Periodicity | Weekly | 56 | 15,6 | 0,5 | 0,7 | 0,0 | 3,9 | 0,5 | 1,2 | 0,0 | 3,9 | -1,1 | 7,2 | 0,2 | -8,9 | -0,7 | 3,4 | 0,1 | -5,7 |
Active | Language_Chinese | -0,2 | 0,7 | 0,2 | -8,6 | -0,3 | 2,4 | 0,4 | -12,0 | 0,1 | 0,1 | 0,0 | 2,7 | -0,1 | 0,7 | 0,1 | -5,5 | |||
Active | Language_English | 1,3 | 4,9 | 0,3 | 10,1 | 1,5 | 10,6 | 0,4 | 11,6 | -0,7 | 2,7 | 0,1 | -5,4 | 0,2 | 0,3 | 0,0 | 1,6 | |||
Active | Language_Japanese | -0,7 | 0,2 | 0,0 | -1,8 | 1,0 | 0,6 | 0,0 | 2,5 | 1,5 | 1,5 | 0,0 | 3,7 | 3,7 | 9,8 | 0,2 | 9,1 | |||
Active | Language_Other | 0,1 | 0,0 | 0,0 | 0,4 | 0,6 | 0,3 | 0,0 | 1,9 | 0,6 | 0,4 | 0,0 | 2,0 | 0,7 | 0,6 | 0,0 | 2,3 | |||
Active | Publisher_Academic | 2,2 | 1,3 | 0,1 | 4,9 | 0,8 | 0,3 | 0,0 | 1,8 | 2,2 | 2,6 | 0,1 | 5,0 | -0,9 | 0,4 | 0,0 | -1,9 | |||
Active | Publisher_Newsgroup | 0,8 | 0,7 | 0,0 | 3,7 | 1,3 | 3,2 | 0,1 | 6,1 | -1,5 | 4,7 | 0,1 | -6,8 | 0,8 | 1,5 | 0,0 | 3,6 | |||
Active | Publisher_Newspaper | -0,0 | 0,0 | 0,0 | -0,1 | 0,9 | 1,0 | 0,0 | 3,3 | 0,4 | 0,2 | 0,0 | 1,5 | 2,4 | 9,4 | 0,2 | 9,0 | |||
Active | Publisher_Official | 1,0 | 0,3 | 0,0 | 2,3 | -1,5 | 1,1 | 0,0 | -3,5 | 0,6 | 0,2 | 0,0 | 1,4 | 0,4 | 0,1 | 0,0 | 0,9 | |||
Active | Publisher_Organization | 1,4 | 2,1 | 0,1 | 6,3 | -1,1 | 2,1 | 0,1 | -5,0 | 0,5 | 0,5 | 0,0 | 2,1 | 0,2 | 0,1 | 0,0 | 0,8 | |||
Active | Publisher_Other | 2,2 | 0,8 | 0,0 | 3,9 | 1,2 | 0,4 | 0,0 | 2,1 | -1,6 | 0,8 | 0,0 | -2,8 | 0,3 | 0,0 | 0,0 | 0,5 | |||
Active | Publisher_Private | 1,1 | 1,2 | 0,1 | 4,7 | 1,4 | 3,2 | 0,1 | 6,0 | 0,3 | 0,2 | 0,0 | 1,5 | -0,4 | 0,3 | 0,0 | -1,7 | |||
Active | Publisher_Publishing house | 1,8 | 4,3 | 0,2 | 9,1 | -0,6 | 0,7 | 0,0 | -2,8 | -0,0 | 0,0 | 0,0 | -0,2 | 0,5 | 0,8 | 0,0 | 2,6 | |||
Active | Publisher.NA | -0,5 | 3,6 | 0,6 | -14,7 | -0,1 | 0,3 | 0,0 | -3,2 | 0,0 | 0,0 | 0,0 | 0,1 | -0,2 | 1,5 | 0,1 | -6,4 | |||
Active | Nationality_American | 1,5 | 1,4 | 0,1 | 5,0 | 1,9 | 3,8 | 0,1 | 6,5 | 0,0 | 0,0 | 0,0 | 0,0 | 0,1 | 0,0 | 0,0 | 0,2 | |||
Active | Nationality_British | 1,4 | 2,6 | 0,1 | 7,1 | 1,8 | 7,6 | 0,2 | 9,4 | -0,8 | 1,6 | 0,1 | -4,0 | -0,1 | 0,0 | 0,0 | -0,6 | |||
Active | Nationality_Chinese | 1,2 | 4,9 | 0,3 | 10,3 | -0,9 | 4,5 | 0,2 | -7,7 | 0,3 | 0,8 | 0,0 | 3,0 | 0,6 | 2,3 | 0,1 | 4,8 | |||
Active | Nationality_Other | 0,3 | 0,0 | 0,0 | 0,9 | 1,7 | 2,4 | 0,1 | 5,2 | -0,2 | 0,1 | 0,0 | -0,7 | 2,7 | 7,8 | 0,2 | 8,1 | |||
Active | Nationality.NA | -0,5 | 3,6 | 0,6 | -14,7 | -0,1 | 0,3 | 0,0 | -3,2 | 0,0 | 0,0 | 0,0 | 0,1 | -0,2 | 1,5 | 0,1 | -6,4 | |||
Active | Audited.NA | -0,4 | 0,4 | 0,0 | -3,0 | 1,0 | 5,3 | 0,2 | 8,2 | 1,3 | 10,9 | 0,3 | 10,9 | -0,1 | 0,0 | 0,0 | -0,5 | |||
Active | Circulation.NA | -0,4 | 0,4 | 0,0 | -3,0 | 1,0 | 5,3 | 0,2 | 8,2 | 1,3 | 10,9 | 0,3 | 10,9 | -0,1 | 0,0 | 0,0 | -0,5 | |||
Active | PageNbr.NA | 1,2 | 0,2 | 0,0 | 2,1 | 4,9 | 6,7 | 0,2 | 8,6 | 5,0 | 8,0 | 0,2 | 8,6 | -3,8 | 5,2 | 0,1 | -6,6 | |||
Active | PageSize.NA | 0,9 | 0,2 | 0,0 | 1,7 | 4,0 | 5,9 | 0,2 | 8,0 | 4,1 | 7,2 | 0,2 | 8,2 | -3,1 | 4,8 | 0,1 | -6,3 | |||
Active | ColNbr.NA | 1,7 | 10,3 | 0,6 | 15,0 | -0,7 | 2,7 | 0,1 | -5,9 | 0,6 | 2,6 | 0,1 | 5,4 | -0,1 | 0,2 | 0,0 | -1,3 |
write_csv2(results_active31, "results_active31.csv")
Supplementary variables
# Coordinates
coord_sup31 <- as_tibble(res_acm31$quali.sup$coord,
rownames = "categories") %>%
mutate_if(is.numeric, round, digits = 2) %>%
rename_all(tolower) %>%
rename_all(~ str_replace(., " ", "")) %>%
rename_if(is.numeric, ~ str_c(., "coord", sep = "_"))
# Cosinus carrés
cos2_sup31 <- as_tibble(res_acm31$quali.sup$cos2,
rownames = "categories") %>%
mutate_if(is.numeric, round, digits = 2) %>%
rename_all(tolower) %>%
rename_all(~ str_replace(., " ", "")) %>%
rename_if(is.numeric, ~ str_c(., "cos2", sep = "_"))
# vtest
vtest_sup31 <- as_tibble(res_acm31$quali.sup$v.test,
rownames = "categories") %>%
mutate_if(is.numeric, round, digits = 2) %>%
rename_all(tolower) %>%
rename_all(~ str_replace(., " ", "")) %>%
rename_if(is.numeric, ~ str_c(., "vtest", sep = "_"))
# Reassemble
results_sup31 <- frequences31 %>%
right_join(coord_sup31) %>%
right_join(cos2_sup31) %>%
right_join(vtest_sup31) %>%
mutate(type = "Supplementary") %>%
select(type, variables, categories, n, percentage,
contains("dim1"), contains("dim2"),
contains("dim3"), contains("dim4"))
results_sup31 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 1) %>%
autofit() %>%
set_caption("Supplementary Variables: Statistical Results")
type | variables | categories | n | percentage | dim1_coord | dim1_cos2 | dim1_vtest | dim2_coord | dim2_cos2 | dim2_vtest | dim3_coord | dim3_cos2 | dim3_vtest | dim4_coord | dim4_cos2 | dim4_vtest |
Supplementary | Founded | 1829-1903 | 17 | 4,7 | 0,8 | 0,0 | 3,5 | 0,8 | 0,0 | 3,5 | -0,7 | 0,0 | -3,1 | 1,0 | 0,1 | 4,2 |
Supplementary | Founded | 1904-1916 | 45 | 12,5 | -0,2 | 0,0 | -1,8 | 0,0 | 0,0 | 0,1 | 0,3 | 0,0 | 2,4 | 0,3 | 0,0 | 2,5 |
Supplementary | Founded | 1917-1927 | 117 | 32,5 | -0,2 | 0,0 | -2,9 | -0,1 | 0,0 | -1,0 | 0,1 | 0,0 | 1,0 | -0,0 | 0,0 | -0,1 |
Supplementary | Founded | 1928-1935 | 148 | 41,1 | -0,1 | 0,0 | -1,4 | -0,2 | 0,0 | -2,7 | -0,1 | 0,0 | -1,3 | -0,3 | 0,1 | -5,0 |
Supplementary | Province | CT | 10 | 2,8 | -0,6 | 0,0 | -1,9 | -0,3 | 0,0 | -1,0 | -0,1 | 0,0 | -0,4 | -0,0 | 0,0 | -0,1 |
Supplementary | Province | NNE | 94 | 26,1 | -0,3 | 0,0 | -4,0 | -0,1 | 0,0 | -0,6 | 0,2 | 0,0 | 2,5 | 0,1 | 0,0 | 1,2 |
Supplementary | Province | NW | 2 | 0,6 | -0,6 | 0,0 | -0,9 | -0,4 | 0,0 | -0,5 | -0,1 | 0,0 | -0,1 | -0,1 | 0,0 | -0,1 |
Supplementary | Province | SE | 225 | 62,5 | 0,2 | 0,1 | 5,7 | 0,1 | 0,0 | 2,1 | -0,1 | 0,0 | -1,6 | -0,0 | 0,0 | -0,5 |
Supplementary | Province | SW | 29 | 8,1 | -0,4 | 0,0 | -2,3 | -0,4 | 0,0 | -2,1 | -0,2 | 0,0 | -0,9 | -0,2 | 0,0 | -1,1 |
Supplementary | Founded.NA | 1,1 | 0,1 | 6,5 | 0,6 | 0,0 | 3,7 | 0,0 | 0,0 | 0,1 | 0,4 | 0,0 | 2,6 |
write_csv2(results_sup31, "results_sup31.csv")
Assemble active and supplementary variables
results_all31 <- bind_rows(results_active31, results_sup31)
write_csv2(results_all31, "results_all31.csv")
Point cloud of active variables (size proportionate to importance of variable)
results_active31 %>%
filter(dim1_contrib > threshold31 |
dim2_contrib > threshold31) %>%
ggplot(aes(x = dim1_coord, y = dim2_coord,
label = categories,
shape = variables,
size = n)) +
geom_point() +
coord_fixed() +
geom_text_repel(size = 3, segment.alpha = 0.5) +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances31[1, 3], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances31[2, 3], 1), " %)")) +
scale_shape_manual(name = "", values = 0:20) +
guides(shape=guide_legend(title = ""), size = FALSE) +
theme_minimal() +
theme(legend.position="bottom")
Point cloud of all variables (size proportionate to importance of variable)
results_all31 %>%
filter(dim1_contrib > threshold31 |
dim2_contrib > threshold31 |
is.na(dim2_contrib) & dim1_coord > 0.29 |
is.na(dim2_contrib) & dim1_coord < -0.31) %>%
ggplot(aes(x = dim1_coord, y = dim2_coord,
label = categories,
shape = variables,
colour = type,
size = n)) +
geom_point() +
coord_fixed() + #
geom_text_repel(size = 3, segment.alpha = 0.5) +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances31[1, 2], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances31[2, 2], 1), " %)")) +
scale_shape_manual(name="", values = 0:20) +
scale_color_manual(values = c("black", "darkgrey")) +
# scale_color_brewer(palette = "Set1") +
# scale_color_grey() +
# scale_color_brewer(palette = "Accent")
guides(shape = guide_legend(title="Variable Name (Active & Supplementary)",
title.position = "top"),
colour = guide_legend(title = "Variable Type",
title.position = "top",
nrow = 2),
size = FALSE) +
theme_minimal() +
theme(legend.position="bottom")
Point cloud of individuals (periodicals) (colored by language, with confidence ellipses)
indiv12 <- as_tibble(res_acm31$ind$coord[,1:2])
indiv12_lang <- crowmca31fct %>%
select(Language) %>%
bind_cols(indiv12)
ggplot(indiv12_lang, aes(x = `Dim 1`, y = `Dim 2`,
colour = Language)) +
geom_point(alpha = 0.6) +
coord_fixed() +
stat_ellipse() +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances31[1, 3], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances31[2, 3], 1), " %)")) +
scale_color_brewer(palette = "Set2") +
guides(colour = guide_legend(title="Language")) +
theme_minimal()
Point cloud of individuals (periodicals) (colored by periodicity, with confidence ellipses)
indiv12_periodicity <- crowmca31fct %>%
select(Periodicity) %>%
bind_cols(indiv12)
ggplot(indiv12_periodicity, aes(x = `Dim 1`, y = `Dim 2`,
colour = Periodicity)) +
geom_point(alpha = 0.6) +
coord_fixed() +
stat_ellipse() +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances31[1, 3], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances31[2, 3], 1), " %)")) +
scale_color_brewer(palette = "Set2") +
guides(colour = guide_legend(title="Periodicity")) +
theme_minimal()
Alternative approach with factoextra
Point cloud of individuals with confidence ellipse (language)
grp <- as.factor(crowmca31fct[, "Language"])
fviz_mca_ind(res_acm31, habillage = grp, label = FALSE,
addEllipses = TRUE, repel = TRUE, title = "Graph of Periodicals: Language (1931)")
Point cloud of individuals with confidence ellipse (periodicity)
grp <- as.factor(crowmca31fct[, "Periodicity"])
fviz_mca_ind(res_acm31, habillage = grp, label = FALSE,
addEllipses = TRUE, repel = TRUE, title = "Graph of Periodicals: Periodicity (1931)")
res.MCA1<-MCA(crowmca31,graph=FALSE)
Altogether, the two first dimensions capture almost 17% of information (10.67% on the first dimension, 6.35% on the second), which is not so bad given the high number of observations and variables in the dataset. 12 dimensions are necessary to capture at least 50% of information, 22 dimensions to retain 75% of information, and 48 for 100%.
Visualize variables contributions on axis 1:
fviz_contrib(res.MCA1, choice ="var", axes = 1)
Visualize variables contributions on axis 2:
fviz_contrib(res.MCA1, choice ="var", axes = 2)
Visualize individual contributions on first axis (top 20):
fviz_contrib(res.MCA1, choice ="ind", axes = 1, top = 20)
Plot the results:
plot.MCA(res.MCA1, choix='var',title="Newspaper directory (1931): Graph of variables",col.var=c(1,2,3,4,5,6,7,8,9,10,11))
plot.MCA(res.MCA1,invisible= 'ind',selectMod= 'cos2 0',col.var=c(1,1,1,1,1,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,5,5,6,6,6,6,6,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,10,10,10,10,11,11,11,11,11,11),title="Newspaper directory (1931): ACM graph",label =c('var'))
Improve the visualization with Factoextra:
# Graph of variable categories:
fviz_mca_var(res.MCA1, repel = TRUE)
# Biplot of individuals and variables:
fviz_mca_biplot(res.MCA1, repel = TRUE)
The graph of variables delineates four main groups:
One the second graph (ACM), the first dimension clearly opposes Chinese-language daily broadsheets (on the left) and foreign-language (English), small-size (octavo) periodicals with less frequent periodicity (monthly, quarterly, annual) (on the right). The latter referred to earlier, pioneering publications (1829-1903), whereas the former included the most recently established periodicals (1928-1935). We can see a gradient from the most to the least frequent and recent as we move along the x axis from left to right. The second dimension further separates foreign commercial publishers (newspaper, newsgroup, private entreprises) above and Chinese publishers (organization, academic, publishing house) below. As we move downward, the same dimension delineates a gradient from less (10-50 pages) to more substantial periodicals (100-500 pages). The former group was associated with weeklies, the latter with monthlies.
Just run the following lines of code to obtain the summary of results and dimensions:
summary(res.MCA1)
dim(res.MCA1)
We can launch explor to interact with the graph:
explor(res.MCA1)
Each variable is represented by a distinct color. Larger circles reflect stronger contribution:
res <- explor::prepare_results(res.MCA1)
explor::MCA_var_plot(res, xax = 1, yax = 2, var_sup = FALSE, var_sup_choice = ,
var_lab_min_contrib = 0, col_var = "Variable", symbol_var = NULL, size_var = "Contrib",
size_range = c(52.5, 700), labels_size = 10, point_size = 56, transitions = TRUE,
labels_positions = "auto", labels_prepend_var = FALSE, xlim = c(-3.28, 4.81),
ylim = c(-2.94, 5.14))
Distribution of periodicals according to their language:
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "Language", labels_size = 9, point_opacity = 0.5,
opacity_var = NULL, point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-2.12, 3.31), ylim = c(-2.15, 3.28))
Distribution of periodicals according to their periodicity (stronger opacity reflects lower quality of projection):
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "Periodicity", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-2.12, 3.31), ylim = c(-2.15, 3.28))
Finally, we apply hierarchical clustering on all 48 dimensions:
res.MCA<-MCA(crowmca31,ncp=48,graph=FALSE)
res.HCPC<-HCPC(res.MCA,nb.clust=4,consol=FALSE,graph=FALSE)
plot.HCPC(res.HCPC,choice='tree',title='Newspaper Directory (1931): Hierarchical Tree')
plot.HCPC(res.HCPC,choice='map',draw.tree=FALSE,title='Newspaper Directory (1931): Factor Map')
plot.HCPC(res.HCPC,choice='3D.map',ind.names=FALSE,centers.plot=FALSE,angle=60,title='Newspaper Directory (1931): Factor Map on Hierarchical Tree')
The partition is characterized by (by decreasing importance): the number of pages, the size of page, publishers’ profile and nationality, periodicity, layout (number of column), language, year of establishment, and circulation data (audited). The place of publication (province) is not significant in the partition.
On this basis, the algorithm detected 4 clusters of periodicals:
The following lines of code generates a summary of results:
summary(res.HCPC)
Alternative visualization using FactoExtra
res.mca <- MCA(crowmca31fct,
ncp = 48, # Number of components kept : 48
quali.sup = c(3:4), # Qualitative supplementary variables (province, founded)
graph=FALSE)
res.hcpc <- HCPC (res.mca, graph = FALSE)
Dendograms:
fviz_dend(res.hcpc, show_labels = FALSE,
main = "Cluster dendogram of Chinese periodicals (1931)",
caption = "Based on 'Newspaper Directory of China' (1931)")
Individuals (periodicals) factor map:
fviz_cluster(res.hcpc, geom = "point",
main = "Factor map of Chinese periodicals (1931)",
caption = "Based on 'Newspaper Directory of China' (1931)")
Statistical Description:
res.hcpc$desc.var$test.chi2 # Variables
desccat <- res.hcpc$desc.var$category # by variable categories
res.hcpc$desc.axes # by principal components
res.hcpc$desc.ind$para # by individuals
We perform a MCA on the second dataset from the 1935 directory. The dataset contains 703 periodicals and 11 variables. We transform all variables into factor variables as required by MCA. We consider all variables as active variables, except for the place of publication (Province) and the period of founding (Founded), which we set as supplementary variables.
crowmca35fct <- crowmca35 %>% mutate_all(factor)
res_acm35 <- MCA(crowmca35fct, quali.sup = 3:4)
res_acm35
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 703 individuals, described by 11 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$quali.sup" "results for the supplementary categorical variables"
## 12 "$quali.sup$coord" "coord. for the supplementary categories"
## 13 "$quali.sup$cos2" "cos2 for the supplementary categories"
## 14 "$quali.sup$v.test" "v-test for the supplementary categories"
## 15 "$call" "intermediate results"
## 16 "$call$marge.col" "weights of columns"
## 17 "$call$marge.li" "weights of rows"
Dimensions description
variances35 <- as.data.frame(res_acm35$eig) %>%
rownames_to_column() %>% # retrieve row names (dim 1, dim 2, etc) in a distinct column
slice(1:10) %>% # retain information on the 10 first dimensions only
mutate(Axis = str_replace_all(rowname, "dim", "Axis")) %>% # create a new variable from rowname, with values "Axis 1, Axis 2, etc. instead of "dim 1, dim 2.." etc)
select(-rowname) %>% # remove this column since we don't need it anymore
mutate(Axis = fct_relevel(Axis, paste("Axis", 1:10))) %>% # to ensure the dimensions are ordered correctly on the graphs
select(Axis, `eigenvalue`, `percentage of variance`, `cumulative percentage of variance`)
variances35 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 2) %>%
autofit() %>%
set_caption("Eigenvalue & variances along axes")
Axis | eigenvalue | percentage of variance | cumulative percentage of variance |
Axis 1 | 0,50 | 10,69 | 10,69 |
Axis 2 | 0,33 | 7,11 | 17,81 |
Axis 3 | 0,25 | 5,46 | 23,27 |
Axis 4 | 0,22 | 4,70 | 27,97 |
Axis 5 | 0,21 | 4,44 | 32,41 |
Axis 6 | 0,18 | 3,93 | 36,34 |
Axis 7 | 0,17 | 3,56 | 39,90 |
Axis 8 | 0,16 | 3,38 | 43,28 |
Axis 9 | 0,15 | 3,23 | 46,51 |
Axis 10 | 0,13 | 2,86 | 49,37 |
# pour exporter ce tableau, plusieurs solutions possibles, notamment :
write_csv2(variances35, "variances35.csv")
Variance
ggplot(variances35, aes(x = Axis)) +
geom_bar(aes(y = `percentage of variance`),
stat = "identity",
fill = "red") +
xlab("") +
ylab("% of variance") +
theme_minimal()
Cumulative variance
ggplot(variances35, aes(x = Axis)) +
geom_bar(aes(y = `cumulative percentage of variance`),
stat = "identity",
fill = "blue") +
xlab("") +
ylab("cumulative percentage of variance") +
theme_minimal()
Contribution threshold
threshold35 <- 100 / nrow(res_acm35$var$coord)
Variables frequency
frequences35 <- crowmca35fct %>%
pivot_longer(everything(),
names_to = "variables",
values_to = "categories") %>% # count all occurrences of variable/category
count(variables, categories) %>% # count only unique pairs of variable/category (i.e. number of unique individuals for each category)
group_by(variables) %>%
mutate(percentage = round(100 * n / nrow(crowmca35fct), 1)) %>% # compute percentage for each group of variables
ungroup() %>%
select(variables, categories, n, percentage) # re-order variables
frequences35 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 1) %>%
autofit() %>%
set_caption("Variables Frequency on Crow 1935 dataset")
variables | categories | n | percentage |
Audited | Audited | 9 | 1,3 |
Audited | Publisher's statement | 670 | 95,3 |
Audited | 24 | 3,4 | |
Circulation | [0,500] | 38 | 5,4 |
Circulation | (500,1e+03] | 106 | 15,1 |
Circulation | (1e+03,2.5e+03] | 188 | 26,7 |
Circulation | (2.5e+03,5e+03] | 166 | 23,6 |
Circulation | (5e+03,1e+04] | 96 | 13,7 |
Circulation | (1e+04,5e+04] | 72 | 10,2 |
Circulation | (5e+04,1.5e+05] | 13 | 1,8 |
Circulation | 24 | 3,4 | |
ColNbr | [0,3] | 16 | 2,3 |
ColNbr | (3,5] | 9 | 1,3 |
ColNbr | (5,8] | 38 | 5,4 |
ColNbr | (8,10] | 2 | 0,3 |
ColNbr | (10,13] | 15 | 2,1 |
ColNbr | 623 | 88,6 | |
Founded | 1829-1903 | 15 | 2,1 |
Founded | 1904-1916 | 60 | 8,5 |
Founded | 1917-1927 | 146 | 20,8 |
Founded | 1928-1935 | 443 | 63,0 |
Founded | 39 | 5,5 | |
Language | Other | 7 | 1,0 |
Language | Chinese | 620 | 88,2 |
Language | English | 70 | 10,0 |
Language | Japanese | 6 | 0,9 |
Nationality | Other | 13 | 1,8 |
Nationality | Chinese | 82 | 11,7 |
Nationality | American | 16 | 2,3 |
Nationality | British | 24 | 3,4 |
Nationality | 568 | 80,8 | |
PageNbr | [0,10] | 468 | 66,6 |
PageNbr | (10,25] | 97 | 13,8 |
PageNbr | (25,50] | 63 | 9,0 |
PageNbr | (50,100] | 33 | 4,7 |
PageNbr | (100,500] | 35 | 5,0 |
PageNbr | (500,1.2e+03] | 4 | 0,6 |
PageNbr | 3 | 0,4 | |
PageSize | Octavo | 163 | 23,2 |
PageSize | Compact | 164 | 23,3 |
PageSize | Broadsheet | 373 | 53,1 |
PageSize | 3 | 0,4 | |
Periodicity | AQSM | 24 | 3,4 |
Periodicity | BiWM | 21 | 3,0 |
Periodicity | Daily | 503 | 71,6 |
Periodicity | Monthly | 80 | 11,4 |
Periodicity | Weekly | 75 | 10,7 |
Province | CT | 44 | 6,3 |
Province | NNE | 147 | 20,9 |
Province | NW | 9 | 1,3 |
Province | SE | 453 | 64,4 |
Province | SW | 50 | 7,1 |
Publisher | Other | 4 | 0,6 |
Publisher | Official | 16 | 2,3 |
Publisher | Organization | 30 | 4,3 |
Publisher | Private | 16 | 2,3 |
Publisher | Newsgroup | 22 | 3,1 |
Publisher | Newspaper | 7 | 1,0 |
Publisher | Publishing house | 31 | 4,4 |
Publisher | Academic | 9 | 1,3 |
Publisher | 568 | 80,8 |
write_csv2(frequences35, "frequences35.csv")
** Active variables**
# Coordinates
coordinates35 <- as_tibble(res_acm35$var$coord,
rownames = "categories") %>% # retrieve coordinates
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case characters
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "coord", sep = "_")) # add suffix _coord to variable name
# Contributions
contributions35 <- as_tibble(res_acm35$var$contrib,
rownames = "categories") %>% # retrieve contributions
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case characters
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "contrib", sep = "_")) # add suffix _contrib to variable name
# Square cosine
cos2_35 <- as_tibble(res_acm35$var$cos2,
rownames = "categories") %>% # retrieve cos2
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "cos2", sep = "_")) # add suffix _cos2
# vtest
vtest35 <- as_tibble(res_acm35$var$v.test,
rownames = "categories") %>% # retrieve v.test values
mutate_if(is.numeric, round, digits = 2) %>% # retain just 2 digits
rename_all(tolower) %>% # remove upper case
rename_all(~ str_replace(., " ", "")) %>% # remove white spaces
rename_if(is.numeric, ~ str_c(., "vtest", sep = "_")) # add suffix _vtest
# Re-assemble results
results_active35 <- frequences35 %>%
right_join(coordinates35) %>%
right_join(contributions35) %>%
right_join(cos2_35) %>%
right_join(vtest35) %>% # merge data using "categories"
mutate(type = "Active") %>% # add column to specify the type of variable (active)
select(type, variables, categories, n, percentage,
contains("dim1"), contains("dim2"),
contains("dim3"), contains("dim4")) # reorder variables according to their relevance for each axis
results_active35 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 1) %>%
autofit() %>%
set_caption("Active Variables: Statistical Results")
type | variables | categories | n | percentage | dim1_coord | dim1_contrib | dim1_cos2 | dim1_vtest | dim2_coord | dim2_contrib | dim2_cos2 | dim2_vtest | dim3_coord | dim3_contrib | dim3_cos2 | dim3_vtest | dim4_coord | dim4_contrib | dim4_cos2 | dim4_vtest |
Active | Audited | Audited | 9 | 1,3 | 2,3 | 1,5 | 0,1 | 6,8 | 2,5 | 2,7 | 0,1 | 7,5 | -2,6 | 3,7 | 0,1 | -7,8 | 2,9 | 5,3 | 0,1 | 8,6 |
Active | Audited | Publisher's statement | 670 | 95,3 | -0,1 | 0,3 | 0,3 | -14,7 | -0,1 | 0,3 | 0,2 | -12,3 | -0,0 | 0,1 | 0,0 | -5,3 | 0,1 | 0,1 | 0,1 | 5,9 |
Active | Circulation | [0,500] | 38 | 5,4 | -0,1 | 0,0 | 0,0 | -0,5 | -0,3 | 0,2 | 0,0 | -2,0 | -0,5 | 0,5 | 0,0 | -2,9 | -0,4 | 0,5 | 0,0 | -2,7 |
Active | Circulation | (500,1e+03] | 106 | 15,1 | -0,2 | 0,1 | 0,0 | -2,4 | -0,2 | 0,2 | 0,0 | -2,4 | -0,2 | 0,4 | 0,0 | -2,8 | -0,2 | 0,3 | 0,0 | -2,2 |
Active | Circulation | (1e+03,2.5e+03] | 188 | 26,7 | -0,1 | 0,1 | 0,0 | -2,0 | -0,0 | 0,0 | 0,0 | -0,7 | -0,2 | 0,4 | 0,0 | -3,0 | 0,1 | 0,3 | 0,0 | 2,2 |
Active | Circulation | (2.5e+03,5e+03] | 166 | 23,6 | -0,1 | 0,1 | 0,0 | -2,0 | -0,0 | 0,0 | 0,0 | -0,1 | -0,1 | 0,1 | 0,0 | -1,1 | -0,1 | 0,1 | 0,0 | -1,1 |
Active | Circulation | (5e+03,1e+04] | 96 | 13,7 | 0,1 | 0,0 | 0,0 | 0,9 | -0,0 | 0,0 | 0,0 | -0,3 | -0,0 | 0,0 | 0,0 | -0,2 | 0,4 | 1,3 | 0,0 | 4,5 |
Active | Circulation | (1e+04,5e+04] | 72 | 10,2 | 0,0 | 0,0 | 0,0 | 0,4 | -0,2 | 0,1 | 0,0 | -1,6 | 0,4 | 0,8 | 0,0 | 3,7 | 0,5 | 1,3 | 0,0 | 4,4 |
Active | Circulation | (5e+04,1.5e+05] | 13 | 1,8 | -0,1 | 0,0 | 0,0 | -0,5 | 1,1 | 0,7 | 0,0 | 3,9 | 0,8 | 0,6 | 0,0 | 3,0 | 0,4 | 0,1 | 0,0 | 1,5 |
Active | ColNbr | [0,3] | 16 | 2,3 | 3,0 | 4,6 | 0,2 | 12,2 | 0,1 | 0,0 | 0,0 | 0,5 | -1,8 | 3,0 | 0,1 | -7,1 | -1,4 | 2,3 | 0,1 | -5,7 |
Active | ColNbr | (3,5] | 9 | 1,3 | 0,9 | 0,2 | 0,0 | 2,8 | 1,4 | 0,8 | 0,0 | 4,2 | -0,5 | 0,1 | 0,0 | -1,5 | 0,2 | 0,0 | 0,0 | 0,8 |
Active | ColNbr | (5,8] | 38 | 5,4 | 0,9 | 1,0 | 0,1 | 5,8 | 2,1 | 8,0 | 0,2 | 13,3 | -0,9 | 2,0 | 0,1 | -5,8 | 1,3 | 4,9 | 0,1 | 8,5 |
Active | ColNbr | (8,10] | 2 | 0,3 | -0,5 | 0,0 | 0,0 | -0,7 | 1,0 | 0,1 | 0,0 | 1,5 | 0,6 | 0,0 | 0,0 | 0,8 | 1,1 | 0,2 | 0,0 | 1,6 |
Active | ColNbr | (10,13] | 15 | 2,1 | 0,2 | 0,0 | 0,0 | 0,8 | 2,5 | 4,3 | 0,1 | 9,6 | 3,8 | 13,6 | 0,3 | 14,9 | 1,2 | 1,5 | 0,0 | 4,5 |
Active | PageNbr | [0,10] | 468 | 66,6 | -0,5 | 4,0 | 0,5 | -19,3 | 0,0 | 0,0 | 0,0 | 1,6 | -0,1 | 0,1 | 0,0 | -1,8 | -0,2 | 1,1 | 0,1 | -6,6 |
Active | PageNbr | (10,25] | 97 | 13,8 | 0,3 | 0,3 | 0,0 | 3,5 | 1,1 | 5,3 | 0,2 | 11,4 | 0,1 | 0,1 | 0,0 | 1,6 | 0,7 | 3,6 | 0,1 | 7,6 |
Active | PageNbr | (25,50] | 63 | 9,0 | 1,6 | 5,4 | 0,3 | 13,7 | -0,4 | 0,4 | 0,0 | -3,0 | -0,6 | 1,2 | 0,0 | -4,6 | 0,1 | 0,0 | 0,0 | 0,4 |
Active | PageNbr | (50,100] | 33 | 4,7 | 1,6 | 2,7 | 0,1 | 9,4 | -1,1 | 1,8 | 0,1 | -6,4 | 0,2 | 0,1 | 0,0 | 1,4 | 0,1 | 0,0 | 0,0 | 0,3 |
Active | PageNbr | (100,500] | 35 | 5,0 | 1,1 | 1,3 | 0,1 | 6,6 | -1,9 | 6,1 | 0,2 | -11,6 | 1,0 | 2,3 | 0,1 | 6,2 | 1,3 | 4,0 | 0,1 | 7,6 |
Active | PageNbr | (500,1.2e+03] | 4 | 0,6 | 3,4 | 1,5 | 0,1 | 6,9 | 0,4 | 0,0 | 0,0 | 0,8 | 0,4 | 0,0 | 0,0 | 0,7 | -6,8 | 13,5 | 0,3 | -13,7 |
Active | PageSize | Octavo | 163 | 23,2 | 1,2 | 7,7 | 0,5 | 17,8 | -0,9 | 6,4 | 0,2 | -13,2 | 0,1 | 0,2 | 0,0 | 2,2 | 0,2 | 0,3 | 0,0 | 2,3 |
Active | PageSize | Compact | 164 | 23,3 | -0,4 | 0,9 | 0,1 | -6,2 | -0,1 | 0,0 | 0,0 | -1,1 | -0,2 | 0,6 | 0,0 | -3,5 | -0,3 | 1,3 | 0,0 | -4,8 |
Active | PageSize | Broadsheet | 373 | 53,1 | -0,4 | 1,6 | 0,1 | -10,3 | 0,4 | 3,2 | 0,2 | 12,0 | 0,1 | 0,1 | 0,0 | 1,4 | 0,1 | 0,4 | 0,0 | 3,6 |
Active | Periodicity | AQSM | 24 | 3,4 | 1,9 | 2,6 | 0,1 | 9,2 | -1,0 | 1,2 | 0,0 | -5,1 | 0,9 | 1,3 | 0,0 | 4,7 | -1,3 | 2,9 | 0,1 | -6,5 |
Active | Periodicity | BiWM | 21 | 3,0 | -0,3 | 0,1 | 0,0 | -1,2 | -0,6 | 0,4 | 0,0 | -2,9 | -0,1 | 0,0 | 0,0 | -0,6 | -0,5 | 0,4 | 0,0 | -2,3 |
Active | Periodicity | Daily | 503 | 71,6 | -0,4 | 2,8 | 0,4 | -17,6 | 0,3 | 2,1 | 0,2 | 12,4 | 0,0 | 0,0 | 0,0 | 0,6 | 0,0 | 0,0 | 0,0 | 0,4 |
Active | Periodicity | Monthly | 80 | 11,4 | 1,4 | 4,6 | 0,2 | 12,8 | -1,3 | 6,5 | 0,2 | -12,4 | 0,3 | 0,4 | 0,0 | 2,7 | 0,8 | 3,2 | 0,1 | 7,1 |
Active | Periodicity | Weekly | 75 | 10,7 | 0,9 | 1,8 | 0,1 | 7,9 | -0,1 | 0,0 | 0,0 | -0,7 | -0,7 | 2,1 | 0,1 | -6,2 | -0,3 | 0,5 | 0,0 | -2,9 |
Active | Language_Chinese | -0,3 | 1,4 | 0,5 | -19,5 | -0,1 | 0,6 | 0,1 | -10,0 | 0,1 | 0,2 | 0,0 | 4,6 | -0,0 | 0,0 | 0,0 | -0,6 | |||
Active | Language_English | 2,2 | 10,6 | 0,5 | 19,3 | 0,6 | 1,3 | 0,0 | 5,6 | -1,1 | 5,5 | 0,1 | -9,9 | -0,1 | 0,0 | 0,0 | -0,7 | |||
Active | Language_Japanese | 0,7 | 0,1 | 0,0 | 1,7 | 4,1 | 4,7 | 0,1 | 10,0 | 6,0 | 13,2 | 0,3 | 14,7 | 0,8 | 0,2 | 0,0 | 1,8 | |||
Active | Language_Other | 1,4 | 0,4 | 0,0 | 3,7 | 2,5 | 2,0 | 0,1 | 6,6 | 0,5 | 0,1 | 0,0 | 1,2 | 0,9 | 0,4 | 0,0 | 2,5 | |||
Active | Publisher_Academic | 1,6 | 0,7 | 0,0 | 4,7 | -0,7 | 0,2 | 0,0 | -2,1 | 1,1 | 0,7 | 0,0 | 3,4 | -0,5 | 0,1 | 0,0 | -1,4 | |||
Active | Publisher_Newsgroup | 1,8 | 2,1 | 0,1 | 8,3 | 1,6 | 2,6 | 0,1 | 7,5 | -1,9 | 5,0 | 0,1 | -9,2 | 1,1 | 2,1 | 0,0 | 5,5 | |||
Active | Publisher_Newspaper | 1,2 | 0,3 | 0,0 | 3,2 | 3,7 | 4,5 | 0,1 | 9,8 | 3,7 | 6,0 | 0,1 | 9,8 | 2,1 | 2,3 | 0,1 | 5,7 | |||
Active | Publisher_Official | 0,6 | 0,2 | 0,0 | 2,3 | -1,5 | 1,7 | 0,1 | -6,1 | 0,9 | 0,9 | 0,0 | 3,8 | 1,2 | 1,7 | 0,0 | 5,0 | |||
Active | Publisher_Organization | 1,1 | 1,2 | 0,1 | 6,3 | -1,5 | 3,3 | 0,1 | -8,5 | 0,5 | 0,5 | 0,0 | 3,0 | 0,8 | 1,4 | 0,0 | 4,4 | |||
Active | Publisher_Other | 2,5 | 0,8 | 0,0 | 4,9 | -0,5 | 0,0 | 0,0 | -0,9 | -1,9 | 0,9 | 0,0 | -3,8 | 1,4 | 0,6 | 0,0 | 2,9 | |||
Active | Publisher_Private | 2,4 | 2,9 | 0,1 | 9,7 | 1,4 | 1,4 | 0,0 | 5,5 | -1,5 | 2,2 | 0,1 | -6,0 | 0,1 | 0,0 | 0,0 | 0,6 | |||
Active | Publisher_Publishing house | 2,2 | 4,7 | 0,2 | 12,4 | -0,8 | 1,1 | 0,0 | -4,8 | 1,0 | 1,8 | 0,0 | 5,5 | -0,4 | 0,3 | 0,0 | -2,2 | |||
Active | Publisher.NA | -0,4 | 2,7 | 0,6 | -21,0 | 0,0 | 0,0 | 0,0 | 2,1 | -0,0 | 0,1 | 0,0 | -2,2 | -0,1 | 0,7 | 0,1 | -7,2 | |||
Active | Nationality_American | 2,7 | 3,7 | 0,2 | 10,9 | 1,3 | 1,4 | 0,0 | 5,4 | -0,7 | 0,5 | 0,0 | -2,8 | -1,2 | 1,6 | 0,0 | -4,8 | |||
Active | Nationality_British | 2,4 | 4,4 | 0,2 | 12,0 | 1,1 | 1,3 | 0,0 | 5,2 | -2,4 | 8,3 | 0,2 | -11,8 | 0,2 | 0,1 | 0,0 | 1,1 | |||
Active | Nationality_Chinese | 1,2 | 3,8 | 0,2 | 11,6 | -1,3 | 6,8 | 0,2 | -12,8 | 0,9 | 4,6 | 0,1 | 9,2 | 0,7 | 3,1 | 0,1 | 7,0 | |||
Active | Nationality_Other | 1,6 | 1,0 | 0,1 | 5,6 | 3,1 | 5,8 | 0,2 | 11,1 | 1,0 | 0,8 | 0,0 | 3,6 | 2,3 | 4,8 | 0,1 | 8,3 | |||
Active | Nationality.NA | -0,4 | 2,7 | 0,6 | -21,0 | 0,0 | 0,0 | 0,0 | 2,1 | -0,0 | 0,1 | 0,0 | -2,2 | -0,1 | 0,7 | 0,1 | -7,2 | |||
Active | Audited.NA | 2,6 | 5,1 | 0,2 | 12,9 | 1,9 | 4,3 | 0,1 | 9,7 | 2,2 | 7,2 | 0,2 | 11,0 | -2,5 | 10,3 | 0,2 | -12,2 | |||
Active | Circulation.NA | 2,6 | 5,1 | 0,2 | 12,9 | 1,9 | 4,3 | 0,1 | 9,7 | 2,2 | 7,2 | 0,2 | 11,0 | -2,5 | 10,3 | 0,2 | -12,2 | |||
Active | PageNbr.NA | 0,3 | 0,0 | 0,0 | 0,5 | -0,1 | 0,0 | 0,0 | -0,1 | -1,0 | 0,2 | 0,0 | -1,7 | -2,6 | 1,4 | 0,0 | -4,5 | |||
Active | PageSize.NA | 2,2 | 0,5 | 0,0 | 3,9 | 0,0 | 0,0 | 0,0 | 0,0 | -1,4 | 0,3 | 0,0 | -2,3 | -6,1 | 8,1 | 0,2 | -10,6 | |||
Active | ColNbr.NA | -0,1 | 0,4 | 0,2 | -11,1 | -0,2 | 1,4 | 0,4 | -15,8 | 0,0 | 0,0 | 0,0 | 1,1 | -0,1 | 0,3 | 0,1 | -6,0 |
write_csv2(results_active35, "results_active35.csv")
Supplementary variables
# Coordinates
coord_sup35 <- as_tibble(res_acm35$quali.sup$coord,
rownames = "categories") %>%
mutate_if(is.numeric, round, digits = 2) %>%
rename_all(tolower) %>%
rename_all(~ str_replace(., " ", "")) %>%
rename_if(is.numeric, ~ str_c(., "coord", sep = "_"))
# Cosinus carrés
cos2_sup35 <- as_tibble(res_acm35$quali.sup$cos2,
rownames = "categories") %>%
mutate_if(is.numeric, round, digits = 2) %>%
rename_all(tolower) %>%
rename_all(~ str_replace(., " ", "")) %>%
rename_if(is.numeric, ~ str_c(., "cos2", sep = "_"))
# vtest
vtest_sup35 <- as_tibble(res_acm35$quali.sup$v.test,
rownames = "categories") %>%
mutate_if(is.numeric, round, digits = 2) %>%
rename_all(tolower) %>%
rename_all(~ str_replace(., " ", "")) %>%
rename_if(is.numeric, ~ str_c(., "vtest", sep = "_"))
# Reassemble
results_sup35 <- frequences35 %>%
right_join(coord_sup35) %>%
right_join(cos2_sup35) %>%
right_join(vtest_sup35) %>%
mutate(type = "Supplementary") %>%
select(type, variables, categories, n, percentage,
contains("dim1"), contains("dim2"),
contains("dim3"), contains("dim4"))
results_sup35 %>%
flextable() %>%
colformat_double(decimal.mark = ",", digits = 1) %>%
autofit() %>%
set_caption("Supplementary Variables: Statistical Results")
type | variables | categories | n | percentage | dim1_coord | dim1_cos2 | dim1_vtest | dim2_coord | dim2_cos2 | dim2_vtest | dim3_coord | dim3_cos2 | dim3_vtest | dim4_coord | dim4_cos2 | dim4_vtest |
Supplementary | Founded | 1829-1903 | 15 | 2,1 | 1,0 | 0,0 | 3,9 | 1,0 | 0,0 | 3,8 | -1,1 | 0,0 | -4,5 | 1,2 | 0,0 | 4,6 |
Supplementary | Founded | 1904-1916 | 60 | 8,5 | -0,1 | 0,0 | -0,8 | 0,3 | 0,0 | 2,4 | 0,3 | 0,0 | 2,8 | 0,2 | 0,0 | 1,9 |
Supplementary | Founded | 1917-1927 | 146 | 20,8 | -0,2 | 0,0 | -2,2 | 0,1 | 0,0 | 1,3 | 0,1 | 0,0 | 1,3 | 0,1 | 0,0 | 0,6 |
Supplementary | Founded | 1928-1935 | 443 | 63,0 | -0,1 | 0,0 | -4,2 | -0,1 | 0,0 | -5,3 | -0,0 | 0,0 | -0,5 | -0,1 | 0,0 | -2,7 |
Supplementary | Province | CT | 44 | 6,3 | -0,5 | 0,0 | -3,7 | -0,0 | 0,0 | -0,2 | -0,1 | 0,0 | -0,5 | -0,1 | 0,0 | -1,1 |
Supplementary | Province | NNE | 147 | 20,9 | -0,3 | 0,0 | -3,7 | 0,2 | 0,0 | 3,5 | 0,0 | 0,0 | 0,5 | 0,0 | 0,0 | 0,6 |
Supplementary | Province | NW | 9 | 1,3 | -0,6 | 0,0 | -1,8 | 0,1 | 0,0 | 0,3 | -0,1 | 0,0 | -0,2 | -0,1 | 0,0 | -0,4 |
Supplementary | Province | SE | 453 | 64,4 | 0,2 | 0,1 | 7,3 | -0,1 | 0,0 | -2,8 | 0,0 | 0,0 | 0,1 | 0,0 | 0,0 | 0,3 |
Supplementary | Province | SW | 50 | 7,1 | -0,5 | 0,0 | -3,4 | -0,0 | 0,0 | -0,2 | -0,1 | 0,0 | -0,4 | -0,0 | 0,0 | -0,3 |
Supplementary | Founded.NA | 1,8 | 0,2 | 11,3 | 0,6 | 0,0 | 3,7 | -0,3 | 0,0 | -1,8 | -0,1 | 0,0 | -0,5 |
write_csv2(results_sup35, "results_sup35.csv")
Assemble active and supplementary variables
results_all35 <- bind_rows(results_active35, results_sup35)
write_csv2(results_all35, "results_all35.csv")
Point cloud of active variables (size proportionate to importance of variable)
results_active35 %>%
filter(dim1_contrib > threshold35 |
dim2_contrib > threshold35) %>%
ggplot(aes(x = dim1_coord, y = dim2_coord,
label = categories,
shape = variables,
size = n)) +
geom_point() +
coord_fixed() +
geom_text_repel(size = 3, segment.alpha = 0.5) +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances35[1, 3], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances35[2, 3], 1), " %)")) +
scale_shape_manual(name = "", values = 0:20) +
guides(shape=guide_legend(title = ""), size = FALSE) +
theme_minimal() +
theme(legend.position="bottom")
Point cloud of all variables (size proportionate to importance of variable)
results_all35 %>%
filter(dim1_contrib > threshold35 |
dim2_contrib > threshold35 |
is.na(dim2_contrib) & dim1_coord > 0.29 |
is.na(dim2_contrib) & dim1_coord < -0.31) %>%
ggplot(aes(x = dim1_coord, y = dim2_coord,
label = categories,
shape = variables,
colour = type,
size = n)) +
geom_point() +
coord_fixed() + #
geom_text_repel(size = 3, segment.alpha = 0.5) +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances35[1, 2], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances35[2, 2], 1), " %)")) +
scale_shape_manual(name="", values = 0:20) +
scale_color_manual(values = c("black", "darkgrey")) +
# scale_color_brewer(palette = "Set1") +
# scale_color_grey() +
# scale_color_brewer(palette = "Accent")
guides(shape = guide_legend(title="Variable Name (Active & Supplementary)",
title.position = "top"),
colour = guide_legend(title = "Variable Type",
title.position = "top",
nrow = 2),
size = FALSE) +
theme_minimal() +
theme(legend.position="bottom")
Point cloud of individuals (periodicals) (colored by language, with confidence ellipses)
indiv12 <- as_tibble(res_acm35$ind$coord[,1:2])
indiv12_lang <- crowmca35fct %>%
select(Language) %>%
bind_cols(indiv12)
ggplot(indiv12_lang, aes(x = `Dim 1`, y = `Dim 2`,
colour = Language)) +
geom_point(alpha = 0.6) +
coord_fixed() +
stat_ellipse() +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances35[1, 3], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances35[2, 3], 1), " %)")) +
scale_color_brewer(palette = "Set2") +
guides(colour = guide_legend(title="Language")) +
theme_minimal()
Point cloud of individuals (periodicals) (colored by periodicity, with confidence ellipses)
indiv12_periodicity <- crowmca35fct %>%
select(Periodicity) %>%
bind_cols(indiv12)
ggplot(indiv12_periodicity, aes(x = `Dim 1`, y = `Dim 2`,
colour = Periodicity)) +
geom_point(alpha = 0.6) +
coord_fixed() +
stat_ellipse() +
geom_hline(yintercept = 0, colour = "darkgrey", linetype="longdash") +
geom_vline(xintercept = 0, colour = "darkgrey", linetype="longdash") +
xlab(paste0("Axe 1 (", round(variances35[1, 3], 1), " %)")) +
ylab(paste0("Axe 2 (", round(variances35[2, 3], 1), " %)")) +
scale_color_brewer(palette = "Set2") +
guides(colour = guide_legend(title="Periodicity")) +
theme_minimal()
res.MCA2<-MCA(crowmca31,graph=FALSE)
Altogether, the two first dimensions capture almost 17% of information (10.67% on the first dimension, 6.35% on the second, or 7% if we treat province and founding period as supplementary variables), which is not so bad given the high number of observations and variables in the dataset. 12 dimensions are necessary to capture at least 50% of information, 22 dimensions to retain 75% of information, and 48 for 100%.
Visualize variables contributions on axis 1:
fviz_contrib(res.MCA2, choice ="var", axes = 1)
Visualize variables contributions on axis 2:
fviz_contrib(res.MCA2, choice ="var", axes = 2)
Visualize individual contributions on first axis (top 20):
fviz_contrib(res.MCA2, choice ="ind", axes = 1, top = 20)
Let’s plot the results:
plot.MCA(res.MCA2, choix='var',title="Newspaper directory (1935): Graph of variables",col.var=c(1,2,3,4,5,6,7,8,9,10,11))
plot.MCA(res.MCA2,invisible= 'ind',selectMod= 'cos2 0.05',col.var=c(1,1,1,1,1,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,5,5,6,6,6,6,6,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,10,10,10,10,11,11,11,11,11,11),title="Newspaper directory (1935): ACM Graph",label =c('var'))
Altogether, the two first dimensions capture almost 18% of information (9.7% on the first dimension, 6.2% on the second), which is pretty good given the high number of variables and values in the dataset. 13 dimensions are necessary to capture at least 50% of information, 25 dimensions to retain 75% of information, and 48 for 100%.
The graph delineates four main groups of variables:
Like in the previous analysis based on the 1931 directory, the first dimension clearly opposes daily newspapers on the left with less frequent periodicals (weekly, monthly, quarterly, annual) on the right. As we move along the axis from left to right, we see a gradient from the most frequent to the least frequent publications. Moreover, the first dimension clearly separates Chinese-language newspapers on the left from foreign-language periodicals on the right (English Japanese, other European languages). It also dissociates the pioneering English publications on the right (1829-1903) from the most recent ones, mostly Chinese, on the left (1928-1935). As we move from left to right, we also see a gradient from large-size, less substantial newspapers (broadsheets) to small-size (octavo), more extensive periodicals. As in the previous analysis based on the 1931 directory, the second dimension further separates commercial publishers (newspaper, newsgroup, private enterprises) of foreign origin (mostly British and American) above and Chinese publishers below (organization, official, publishing house) primarily engaged in monthlies and less frequent periodicals. As we move downward along the second dimension we also see a gradient from more complex layout (larger number of columns) associated with Japanese and minor language newspapers, to simpler layout (fewer columns) associated with less frequent, book-model periodicals (monthlies, quaterlies, annuals).
The following lines of code generates a summary of the results and a description of the dimensions:
summary(res.MCA2)
dim(res.MCA2)
We launch explor to interact with the graph:
explor(res.MCA2)
Each variable is represented by a distinct color. Larger circles reflect stronger contribution:
res <- explor::prepare_results(res.MCA2)
explor::MCA_var_plot(res, xax = 1, yax = 2, var_sup = FALSE, var_sup_choice = ,
var_lab_min_contrib = 0, col_var = "Variable", symbol_var = NULL, size_var = "Contrib",
size_range = c(52.5, 700), labels_size = 10, point_size = 56, transitions = TRUE,
labels_positions = "auto", labels_prepend_var = FALSE, xlim = c(-2.67, 5.54),
ylim = c(-3.08, 5.13))
Distribution of periodicals according to their language:
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "Language", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-1.91, 4.34), ylim = c(-2.24, 4))
Distribution of periodicals according to their periodicity (stronger opacity reflects lower quality of projection):
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "Periodicity", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-1.91, 4.34), ylim = c(-2.24, 4))
Finally, we apply hierarchical clustering on all 48 dimensions:
res.MCA<-MCA(crowmca35,ncp=48,graph=FALSE)
res.HCPC<-HCPC(res.MCA,nb.clust=3,consol=FALSE,graph=FALSE)
plot.HCPC(res.HCPC,choice='tree',title='Newspaper Directory (1935): Hierarchical Tree')
plot.HCPC(res.HCPC,choice='map',draw.tree=FALSE,title='Newspaper Directory (1935): Factor Map')
plot.HCPC(res.HCPC,choice='3D.map',ind.names=FALSE,centers.plot=FALSE,angle=60,title='Newspaper Directory (1935): Factor Map on Hierarchical Tree')
The partition is characterized by (by decreasing importance): the number of pages, the publishers’ nationality and profile, the periodicity, the format (size of page), the language, the layout (number of columns), circulation data and the date of founding. The place of publication (province) is not significant in the partition.
On this basis, the algorithm detected 3 clusters of periodicals:
Run the following lines of code to obtain the summary of results:
summary(res.HCPC)
Alternative visualization using FactoExtra
Point cloud of individuals with confidence ellipse (language)
grp <- as.factor(crowmca35fct[, "Language"])
fviz_mca_ind(res_acm35, habillage = grp, label = FALSE,
addEllipses = TRUE, repel = TRUE, title = "Graph of Periodicals: Language (1935)")
Point cloud of individuals with confidence ellipse (periodicity)
grp <- as.factor(crowmca35fct[, "Periodicity"])
fviz_mca_ind(res_acm35, habillage = grp, label = FALSE,
addEllipses = TRUE, repel = TRUE, title = "Graph of Periodicals: Periodicity (1935)")
Cluster analysis
res.mca <- MCA(crowmca35fct,
ncp = 48, # Number of components kept : 48
quali.sup = c(3:4), # Qualitative supplementary variables (province, founded)
graph=FALSE)
res.hcpc <- HCPC (res.mca, graph = FALSE)
Dendograms:
fviz_dend(res.hcpc, show_labels = FALSE,
main = "Cluster dendogram of Chinese periodicals (1935)",
caption = "Based on 'Newspaper Directory of China' (1935)")
Individuals (periodicals) factor map:
fviz_cluster(res.hcpc, geom = "point",
main = "Factor map of Chinese periodicals (1935)",
caption = "Based on 'Newspaper Directory of China' (1935)")
Statistical Description:
res.hcpc$desc.var$test.chi2 # Variables
desccat <- res.hcpc$desc.var$category # by variable categories
res.hcpc$desc.axes # by principal components
res.hcpc$desc.ind$para # by individuals
The two MCA confirm our previous statistical analyses, especially the polarization between large-size daily newspapers with multiple columns (broadsheets) and less frequent periodicals of smaller size but more substance and simpler layout (fewer columns). MCA also highlights the exponential growth of the Chinese native press during the Republic in contrast to the pioneering British periodicals of the late imperial years. This first series of MCA suggests we should rearrange certain variables to obtain more significant results. In future MCA, it is suggested to discard the variables with too many missing values (format, layout, publisher) and focus on the best documented periodicals.
In the next section, we will apply specific MCA on Chinese periodicals only, in order to refine the previous analyses and highlight more nuanced patterns among the Chinese press.
We select Chinese-language periodicals only:
crow_1931_zh <- crow_mca_id1931 %>% filter(Language== "Chinese")
crow_1935_zh <- crow_mca_id1935 %>% filter(Language== "Chinese")
We read the first column as row names:
#1931
crow_1931_zh_tbl <- column_to_rownames(crow_1931_zh, var = "id")
crowmcazh31 <- crow_1931_zh_tbl %>% select(Periodicity, Province,
Founded, Publisher, Nationality, Audited, Circulation, PageNbr, PageSize, ColNbr)
#1935
crow_1935_zh_tbl <- column_to_rownames(crow_1935_zh, var = "id")
crowmcazh35 <- crow_1935_zh_tbl %>% select(Periodicity, Province,
Founded, Publisher, Nationality, Audited, Circulation, PageNbr, PageSize, ColNbr)
We’re all set!
The dataset for 1931 contains 295 Chinese-language periodicals.
res.MCA3<-MCA(crowmcazh31,graph=FALSE)
plot.MCA(res.MCA3, choix='var',title="The Chinese Press (1931): Graph of Variables",col.var=c(1,2,3,4,5,6,7,8,9,10))
plot.MCA(res.MCA3,invisible= 'ind',selectMod= 'cos2 0.05',col.var=c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,9,9,9,10,10,10,10,10,10),title="The Chinese Press (1931): ACM graph",label =c('var'))
Altogether, the two first dimensions capture almost 17% of information (11.69% on the first dimension, 5.65% on the second), which is fairly good the high number of observations and variables in the dataset. 11 dimensions are necessary to capture at least 50% of information, 21 dimensions to retain 75% of information, and 43 for 100%.
The graph of variables delineates four main groups:
The following code provides a summary of the results and a description of MCA dimensions:
summary(res.MCA3)
dimdesc(res.MCA3)
We launch explor to interact with the graph:
explor(res.MCA3)
Each variable is represented by a distinct color. Larger circles reflect stronger contribution:
res <- explor::prepare_results(res.MCA3)
explor::MCA_var_plot(res, xax = 1, yax = 2, var_sup = FALSE, var_sup_choice = ,
var_lab_min_contrib = 0, col_var = "Variable", symbol_var = NULL, size_var = "Contrib",
size_range = c(52.5, 700), labels_size = 10, point_size = 56, transitions = TRUE, labels_positions = "auto",
labels_prepend_var = FALSE, xlim = c(-2.08, 4.83),
ylim = c(-3.01, 3.9))
Distribution of periodicals according to their periodicity (stronger opacity reflects lower quality of projection):
res <- explor::prepare_results(res.MCA3)
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "Periodicity", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE, labels_positions = "auto",
xlim = c(-1.2, 3.1), ylim = c(-2.16, 2.14))
Distribution of periodicals according to their format (size of pages). Stronger opacity refers to lower quality of projection:
res <- explor::prepare_results(res.MCA3)
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "PageSize", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-1.2, 3.1), ylim = c(-2.16, 2.14))
Finally, we apply hierarchical clustering on all 43 dimensions in order to identify subgroups of periodicals:
res.MCA3<-MCA(crowmcazh31,ncp=43,graph=FALSE)
res.HCPC3<-HCPC(res.MCA3,nb.clust=3,consol=FALSE,graph=FALSE)
plot.HCPC(res.HCPC3,choice='tree',title='Chinese periodicals (1931): Hierarchical Tree')
plot.HCPC(res.HCPC3,choice='map',draw.tree=FALSE,title='Chinese periodicals (1931): Factor Map')
plot.HCPC(res.HCPC3,choice='3D.map',ind.names=FALSE,centers.plot=FALSE,angle=60,title='Chinese periodicals (1931): Factor Map on Hierarchical Tree')
The partition is determined by the following variables, by decreasing order: audit, number of column, size of page, circulation, number of pages, periodicity, publisher category, publisher’s nationality, year of founding. The place of publication (province) is not significant in the partition.
The algorithm automatically groups the periodicals into three main clusters:
The following code generates a summary of the results:
summary(res.HCPC3)
The dataset for 1935 contains 620 Chinese-language periodicals.
res.MCA4<-MCA(crowmcazh35, graph=FALSE)
plot.MCA(res.MCA4, choix='var',title="The Chinese Press (1935): Graph of Variables",col.var=c(1,2,3,4,5,6,7,8,9,10))
plot.MCA(res.MCA4,invisible= 'ind',col.var=c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,9,9,9,10,10,10,10,10),title="The Chinese Press (1931): MCA Graph",label =c('var'))
Altogether, the two first dimensions capture 14.57% of information (8.72% on the first dimension, 5.85% on the second), which is not so bad given the high number of observations and variables in the dataset. 13 dimensions are necessary to capture at least 50% of information, 23 dimensions to retain 75% of information, and 43 for 100%.
The graph of variables delineates three main groups:
The following code provides a summary of the results and a description of MCA dimensions:
summary(res.MCA4)
dimdesc(res.MCA4)
We launch explor to interact with the graph:
explor(res.MCA4)
Each variable is represented by a distinct color. Larger circles reflect stronger contribution:
res <- explor::prepare_results(res.MCA4)
explor::MCA_var_plot(res, xax = 1, yax = 2, var_sup = FALSE, var_sup_choice = ,
var_lab_min_contrib = 0, col_var = "Variable", symbol_var = NULL, size_var = "Contrib",
size_range = c(52.5, 700), labels_size = 10, point_size = 56, transitions = TRUE,
labels_positions = "auto", labels_prepend_var = FALSE, xlim = c(-7.45, 13.6),
ylim = c(-5.22, 15.8))
Distribution of periodicals according to their periodicity (stronger opacity reflects lower quality of projection):
res <- explor::prepare_results(res.MCA4)
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "Periodicity", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-3.5, 7.3), ylim = c(-2.68, 8.13))
Distribution of periodicals according to their format (page size). Stronger opacity reflects lower quality of projection:
res <- explor::prepare_results(res.MCA4)
explor::MCA_ind_plot(res, xax = 1, yax = 2, ind_sup = FALSE, lab_var = NULL,
ind_lab_min_contrib = 0, col_var = "PageSize", labels_size = 9, point_opacity = 0.5,
opacity_var = "Cos2", point_size = 64, ellipses = TRUE, transitions = TRUE,
labels_positions = NULL, xlim = c(-3.5, 7.3), ylim = c(-2.68, 8.13))
Finally, we apply hierarchical clustering on all 43 dimensions in order to identify subgroups of periodicals:
res.MCA4<-MCA(crowmcazh35,ncp=43,graph=FALSE)
res.HCPC4<-HCPC(res.MCA,nb.clust=3,consol=FALSE,graph=FALSE)
plot.HCPC(res.HCPC4,choice='tree',title='Chinese periodicals (1935): Hierarchical Tree')
plot.HCPC(res.HCPC4,choice='map',draw.tree=FALSE,title='Chinese periodicals (1935): Factor Map')
plot.HCPC(res.HCPC4,choice='3D.map',ind.names=FALSE,centers.plot=FALSE,angle=60,title='Chinese periodicals (1935): Factor Map on Hierarchical Tree')
The partition is determined by the following variables, by decreasing order: publisher and circulation data, volume and periodicity, format and founding year. The place of publication (province) is not significant in the partition.
The algorithm automatically groups the periodicals into three main clusters:
The following code generates a summary of the results:
summary(res.HCPC4)
Husson, François, J. Josse, and Pagès J. 2010. “Principal Component Methods - Hierarchical Clustering - Partitional Clustering: Why Would We Need to Choose for Visualizing Data?” Unpublished Data. http://www.sthda.com/english/upload/hcpc_husson_josse.pdf.