Data preprocessing

Load data

library(readr)
crowdata <- read_delim("crowdata.csv", ";", 
                       escape_double = FALSE, trim_ws = TRUE)

Select the relevant variables

List of variables:

names(crowdata)

##  [1] "Year"               "City_source"        "City_zh"           
##  [4] "City_py"            "Population"         "Province"          
##  [7] "Title_eng"          "Title_wg"           "Title_zh"          
## [10] "Title_py"           "Established"        "Age"               
## [13] "Language"           "Periodicity"        "Circulation_range" 
## [16] "Circulation_avrg"   "Circulation_min"    "Circulation_max"   
## [19] "Audited"            "Publisher_name"     "Publisher_type"    
## [22] "Nationality"        "Editor"             "Address"           
## [25] "Page_nbr_variation" "Page_nbr_avrg"      "Page_nbr_min"      
## [28] "Page_nbr_max"       "Page_ratio_x_y"     "Page_x"            
## [31] "Page_y"             "Page_surface"       "Col_nbr_variation" 
## [34] "Col_nbr_avrg"       "Col_nbr_min"        "Col_nbr_max"       
## [37] "Col_x_variation"    "Col_x_avrg"         "Col_x_min"         
## [40] "Col_x_max"          "Col_y_variation"    "Col_y_avrg"        
## [43] "Col_y_min"          "Col_y_max"          "Col_y_unit"        
## [46] "sourcepg"           "Anno_id"

We retain only numeric variables as active variables (number of pages, size of page, number of column, circulation), and two categorical variables as supplementary variables (periodicity, language):

crowpca <- crowdata %>% select(Year, Title_eng, Periodicity, Language, 
                               Circulation_avrg, 
                               Page_nbr_avrg, 
                               Page_surface, 
                               Col_nbr_avrg)

We split the dataset into two samples, one for each directory:

crowpca31 <- crowpca %>% filter(Year == 1931)
crowpca35 <- crowpca %>% filter(Year == 1935) 
crowpca31$Year <- NULL
crowpca35$Year <- NULL

Note: There is large proportion of missing values for the number of columns in the 1935 dataset. To solve the problem we may either (1) remove the missing values (but the sample will be very small) (2) ignore the variable.

We create unique ids for each periodical and read title as row names:

For the 1931 directory:

crow31id <- rowid_to_column(crowpca31)
crow31id  <- crow31id  %>% mutate(id = paste(rowid, Title_eng, sep = "."))
crowidpca31 <- column_to_rownames(crow31id, var = "id") 
crowidpca31$rowid <- NULL
crowidpca31$Title_eng <- NULL
datatable(crowidpca31)

For the 1935 directory:

crow35id <- rowid_to_column(crowpca35)
crow35id  <- crow35id  %>% mutate(id = paste(rowid, Title_eng, sep = "."))
crowidpca35 <- column_to_rownames(crow35id, var = "id") 
crowidpca35$rowid <- NULL
crowidpca35$Title_eng <- NULL
datatable(crowidpca35)

We’re all set!

Principal Component Analysis

Load necessary packages

library(FactoMineR)
library(Factoshiny)
library(explor)

Apply PCA to the 1931 dataset. We set “Periodicity” and “Language” as supplementary variables:

res.pca31 <- PCA(crowidpca31, quali.sup = 1:2, graph = FALSE)

Extract eigenvalues/variances

get_eig(res.pca31)

##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  1.6775746         41.93937                    41.93937
## Dim.2  1.0122835         25.30709                    67.24645
## Dim.3  0.8673537         21.68384                    88.93030
## Dim.4  0.4427881         11.06970                   100.00000

Visualize eigenvalues/variances

fviz_screeplot(res.pca31, addlabels = TRUE, ylim = c(0, 50))

Extract the results for variables

var31 <- get_pca_var(res.pca31)
var31

## Principal Component Analysis Results for variables
##  ===================================================
##   Name       Description                                    
## 1 "$coord"   "Coordinates for the variables"                
## 2 "$cor"     "Correlations between variables and dimensions"
## 3 "$cos2"    "Cos2 for the variables"                       
## 4 "$contrib" "contributions of the variables"

Coordinates of variables

head(var31$coord)

##                       Dim.1       Dim.2       Dim.3       Dim.4
## Circulation_avrg  0.2103532  0.88535425 -0.41341156 -0.03146842
## Page_nbr_avrg    -0.6093376  0.43711410  0.60035695  0.27786766
## Page_surface      0.8677050 -0.05902853  0.04669417  0.49134839
## Col_nbr_avrg      0.7135277  0.18406056  0.57778525 -0.35094752

Coordinates of variables

head(var31$contrib)

##                      Dim.1      Dim.2      Dim.3      Dim.4
## Circulation_avrg  2.637646 77.4340500 19.7046617  0.2236423
## Page_nbr_avrg    22.132688 18.8750218 41.5549570 17.4373333
## Page_surface     44.880986  0.3442087  0.2513791 54.5234260
## Col_nbr_avrg     30.348680  3.3467195 38.4890022 27.8155984

Graph of variables (default plot)

fviz_pca_var(res.pca31, col.var = "black")

Graph of variables using colors to represent variables contribution:

fviz_pca_var(res.pca31, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
             )

Color variables by group:

# Create a grouping variable using kmeans
# Create 3 groups of variables (centers = 3)
set.seed(123)
res.km31 <- kmeans(var31$coord, centers = 3, nstart = 25)
grp31 <- as.factor(res.km31$cluster)
# Color variables by groups
fviz_pca_var(res.pca31, col.var = grp31,
             palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
             legend.title = "Cluster")

Correlation plot (quality of projection):

library(corrplot)
corrplot(var31$cos2, is.corr = FALSE)

Correlation plot (contribution):

corrplot(var31$contrib, is.corr = FALSE)

Contribution of variables to the first dimension:

fviz_contrib(res.pca31, choice = "var", axes = 1, top = 10, 
             title = "Variables contribution to the first dimension")

Contribution of variables to the second dimension:

fviz_contrib(res.pca31, choice = "var", axes = 2, top = 10, title = "Variables contribution to the second dimension")

Extract the results for individuals

ind31<- get_pca_ind(res.pca31)
ind31

## Principal Component Analysis Results for individuals
##  ===================================================
##   Name       Description                       
## 1 "$coord"   "Coordinates for the individuals" 
## 2 "$cos2"    "Cos2 for the individuals"        
## 3 "$contrib" "contributions of the individuals"

Coordinates of individuals

head(ind31$coord)

##                            Dim.1    Dim.2       Dim.3      Dim.4
## 1.News Paper            2.421559 8.120157 -3.98422975  0.2885717
## 2.Sin Wen Pao Pictorial 1.688944 7.766754 -4.99987393  0.7300622
## 3.Shanghai Herald       2.345387 8.122252 -3.99784024  0.1937234
## 4.Manchu Nippon, The    2.542951 2.725193  0.15364461 -0.7952601
## 5.Eastern Times         2.291486 2.635480 -0.13798812 -0.5907491
## 6.China Times           2.325594 2.659945 -0.08797687 -0.4878067

Individuals quality of projection on the two first dimensions (35 best projected individuals):

fviz_cos2(res.pca31, choice = "ind", axes = 1:2, top = 30) + 
  coord_flip()

Contribution of individuals on the two first dimensions (35 most contributing):

fviz_contrib(res.pca31, choice = "ind", axes = 1:2, top = 30) + 
  coord_flip()

Graph of individuals:

fviz_pca_ind(res.pca31, col.ind = "cos2", pointsize = "contrib", 
             alpha.ind ="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE # Avoid text overlapping (slow if many points)
             )

We use repel = TRUE to avoid overplotting. We control the color of individuals according to the quality of the projection on the factor map (cos2) quality of the individuals on the factor map, and we use a color gradient to show the gradual quality of projection. The size and transparency of points is proportionate to their contribution.

Biplot of individuals and variables:

fviz_pca_biplot(res.pca31, col.ind = "cos2",
             alpha.ind ="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             label = "var", 
             col.var = "black",
             repel = TRUE)

Note: transparency of points proportionate to their contribution.

Color individuals by groups (periodicity):

periodicity <- as.factor(crowidpca31[, "Periodicity"])
fviz_mca_ind(res.pca31,  habillage = periodicity, 
             pointsize = "contrib", 
             alpha.ind ="contrib",
             addEllipses = TRUE,
             label = "var", 
             repel = TRUE, 
             title = "Periodicals by Periodicity (1931)")

Note: the size and transparency of points is proportionate to their contribution.

Biplot with clusters of variables and periodicity groups:

fviz_pca_biplot(res.pca31, 
                # Fill individuals by groups
                geom.ind = "point",
                pointshape = 21,
                pointsize = 2.5,
                fill.ind = crowidpca31$Periodicity,
                legend.title = list(fill = "Periodicity", color = "Clusters"),
                col.var = factor(c("Circulation", "Length", "Format", "Format")),
                repel = TRUE, 
                title = "Biplot: Periodicity and clustered variables (1931)")

Color individuals by groups (language):

lang <- as.factor(crowidpca31[, "Language"])
fviz_mca_ind(res.pca31,  habillage = lang, 
             alpha.ind ="contrib",
             pointsize = "contrib",
             addEllipses = TRUE, 
             label = "none", 
             repel = TRUE, 
             title = "Periodicals by Language (1931)")

Note: the size and transparency of points is proportionate to their contribution.

Biplot with cluster of variables and language groups:

fviz_pca_biplot(res.pca31, 
                # Fill individuals by groups
                geom.ind = "point",
                pointshape = 21,
                pointsize = 2.5,
                fill.ind = crowidpca31$Language,
                legend.title = list(fill = "Language", color = "Clusters"),
                col.var = factor(c("circulation", "length", "format", "format")),
                repel = TRUE, 
                title = "Biplot: Language and clustered variables (1931)")

HCPC

We perform a hierarchical clustering on all four dimensions:

res.pca31<-PCA(crowidpca31,quali.sup=c(1,2),graph=FALSE)
res.HCPC31<-HCPC(res.pca31,nb.clust=4,consol=FALSE,graph=FALSE)

Cluster dendograms:

fviz_dend(res.HCPC31, show_labels = FALSE, 
          main = "Cluster dendogram of Chinese periodicals (1931)", 
          caption = "Based on 'Newspaper Directory of China' (1931)")

Individuals (periodicals) factor map:

fviz_cluster(res.HCPC31, geom = "point", 
             main = "Factor map of Chinese periodicals (1931)", 
             caption = "Based on 'Newspaper Directory of China' (1931)")

Summary of statistics:

res.HCPC31$desc.var$test.chi2 # Variables

##                  p.value df
## Periodicity 1.285764e-55 18

desccat <- res.HCPC31$desc.var$category # by variable categories
res.HCPC31$desc.axes # by principal components

## 
## Link between the cluster variable and the quantitative variables
## ================================================================
##            Eta2       P-value
## Dim.1 0.7597870 3.045584e-109
## Dim.2 0.6809428  1.902079e-87
## Dim.3 0.5506993  3.530850e-61
## Dim.4 0.4523620  5.247230e-46
## 
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
##          v.test Mean in category  Overall mean sd in category Overall sd
## Dim.3 10.629577         4.928918 -9.739523e-15      0.7108728  0.9313183
## Dim.4  7.972298         2.641308  8.912401e-15      1.3452588  0.6654232
## Dim.2  7.020813         3.517029 -2.466292e-15      0.6707162  1.0061230
## Dim.1 -8.233240        -5.309447 -7.782609e-15      1.4418396  1.2952122
##            p.value
## Dim.3 2.170954e-26
## Dim.4 1.557510e-15
## Dim.2 2.205812e-12
## Dim.1 1.822159e-16
## 
## $`2`
##           v.test Mean in category  Overall mean sd in category Overall sd
## Dim.3  -4.785711       -0.2777771 -9.739523e-15      0.7160729  0.9313183
## Dim.4 -10.569341       -0.4383266  8.912401e-15      0.4532466  0.6654232
## Dim.1 -13.362588       -1.0786572 -7.782609e-15      0.6404140  1.2952122
##            p.value
## Dim.3 1.703834e-06
## Dim.4 4.133905e-26
## Dim.1 1.000252e-40
## 
## $`3`
##          v.test Mean in category  Overall mean sd in category Overall sd
## Dim.1 14.500668        0.8785098 -7.782609e-15      0.6071938  1.2952122
## Dim.4  8.626844        0.2685144  8.912401e-15      0.4916868  0.6654232
## Dim.3  3.989533        0.1737952 -9.739523e-15      0.5459008  0.9313183
## Dim.2 -3.292986       -0.1549739 -2.466292e-15      0.5148146  1.0061230
##            p.value
## Dim.1 1.199774e-47
## Dim.4 6.306810e-18
## Dim.3 6.620360e-05
## Dim.2 9.912942e-04
## 
## $`4`
##          v.test Mean in category  Overall mean sd in category Overall sd
## Dim.2 13.816093         8.003054 -2.466292e-15      0.1670918  1.0061230
## Dim.1  2.885855         2.151963 -7.782609e-15      0.3288776  1.2952122
## Dim.3 -8.070509        -4.327315 -9.739523e-15      0.4756037  0.9313183
##            p.value
## Dim.2 2.038377e-43
## Dim.1 3.903522e-03
## Dim.3 7.000587e-16

res.HCPC31$desc.ind$para # by individuals

## Cluster: 1
##                                   355.Commercial Directory of Shanghai 
##                                                              0.7439473 
##                                                      38.Current Events 
##                                                              1.8143634 
##                                           199.Public Education Monthly 
##                                                              1.8159900 
## 96.Directory and Chronicle of China, Japan, Straits, Philippines, etc. 
##                                                              3.4974420 
## ------------------------------------------------------------ 
## Cluster: 2
##         98.Harbin Noon Press           114.Iron News, The 
##                    0.5190142                    0.5501975 
##                312.Wind, The 76.New Province's Daily News 
##                    0.5717764                    0.5735409 
##               203.Petty News 
##                    0.5810084 
## ------------------------------------------------------------ 
## Cluster: 3
##        66.Honan Citizens' Press            81.New Kiangsu Press 
##                       0.2379443                       0.2384903 
##         345.People's Daily News               322.People's News 
##                       0.2416711                       0.2418862 
## 319.New Eastern Hill Daily News 
##                       0.2493710 
## ------------------------------------------------------------ 
## Cluster: 4
##       3.Shanghai Herald            1.News Paper 2.Sin Wen Pao Pictorial 
##               0.4521510               0.4663188               0.9103843

References

http://www.sthda.com/english/wiki/factoextra-r-package-easy-multivariate-data-analyses-and-elegant-visualization

https://rstudio-pubs-static.s3.amazonaws.com/323416_ab58ad22d9e64ba2831569cf3d14a609.html

http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/117-hcpc-hierarchical-clustering-on-principal-components-essentials/

Crow’s Newspaper Directory of China (1931-5)

Principal Component Analysis

Cécile Armand

2021-08-06

Data preprocessing

Select the relevant variables

Principal Component Analysis

HCPC

References