Abstract
This document applies principal component analysis to analyze the correlation between numerical variables (number of pages, size of pages, number of columns, size of columns) and cluster periodicals according to their length, format and layout. The data come from Crow’s Newspaper Directory of China (1931-5).
Load data
library(readr)
crowdata <- read_delim("crowdata.csv", ";",
escape_double = FALSE, trim_ws = TRUE)
List of variables:
names(crowdata)
## [1] "Year" "City_source" "City_zh"
## [4] "City_py" "Population" "Province"
## [7] "Title_eng" "Title_wg" "Title_zh"
## [10] "Title_py" "Established" "Age"
## [13] "Language" "Periodicity" "Circulation_range"
## [16] "Circulation_avrg" "Circulation_min" "Circulation_max"
## [19] "Audited" "Publisher_name" "Publisher_type"
## [22] "Nationality" "Editor" "Address"
## [25] "Page_nbr_variation" "Page_nbr_avrg" "Page_nbr_min"
## [28] "Page_nbr_max" "Page_ratio_x_y" "Page_x"
## [31] "Page_y" "Page_surface" "Col_nbr_variation"
## [34] "Col_nbr_avrg" "Col_nbr_min" "Col_nbr_max"
## [37] "Col_x_variation" "Col_x_avrg" "Col_x_min"
## [40] "Col_x_max" "Col_y_variation" "Col_y_avrg"
## [43] "Col_y_min" "Col_y_max" "Col_y_unit"
## [46] "sourcepg" "Anno_id"
We retain only numeric variables as active variables (number of pages, size of page, number of column, circulation), and two categorical variables as supplementary variables (periodicity, language):
crowpca <- crowdata %>% select(Year, Title_eng, Periodicity, Language,
Circulation_avrg,
Page_nbr_avrg,
Page_surface,
Col_nbr_avrg)
We split the dataset into two samples, one for each directory:
crowpca31 <- crowpca %>% filter(Year == 1931)
crowpca35 <- crowpca %>% filter(Year == 1935)
crowpca31$Year <- NULL
crowpca35$Year <- NULL
Note: There is large proportion of missing values for the number of columns in the 1935 dataset. To solve the problem we may either (1) remove the missing values (but the sample will be very small) (2) ignore the variable.
We create unique ids for each periodical and read title as row names:
For the 1931 directory:
crow31id <- rowid_to_column(crowpca31)
crow31id <- crow31id %>% mutate(id = paste(rowid, Title_eng, sep = "."))
crowidpca31 <- column_to_rownames(crow31id, var = "id")
crowidpca31$rowid <- NULL
crowidpca31$Title_eng <- NULL
datatable(crowidpca31)
For the 1935 directory:
crow35id <- rowid_to_column(crowpca35)
crow35id <- crow35id %>% mutate(id = paste(rowid, Title_eng, sep = "."))
crowidpca35 <- column_to_rownames(crow35id, var = "id")
crowidpca35$rowid <- NULL
crowidpca35$Title_eng <- NULL
datatable(crowidpca35)
We’re all set!
Load necessary packages
library(FactoMineR)
library(Factoshiny)
library(explor)
Apply PCA to the 1931 dataset. We set “Periodicity” and “Language” as supplementary variables:
res.pca31 <- PCA(crowidpca31, quali.sup = 1:2, graph = FALSE)
Extract eigenvalues/variances
get_eig(res.pca31)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 1.6775746 41.93937 41.93937
## Dim.2 1.0122835 25.30709 67.24645
## Dim.3 0.8673537 21.68384 88.93030
## Dim.4 0.4427881 11.06970 100.00000
Visualize eigenvalues/variances
fviz_screeplot(res.pca31, addlabels = TRUE, ylim = c(0, 50))
Extract the results for variables
var31 <- get_pca_var(res.pca31)
var31
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
Coordinates of variables
head(var31$coord)
## Dim.1 Dim.2 Dim.3 Dim.4
## Circulation_avrg 0.2103532 0.88535425 -0.41341156 -0.03146842
## Page_nbr_avrg -0.6093376 0.43711410 0.60035695 0.27786766
## Page_surface 0.8677050 -0.05902853 0.04669417 0.49134839
## Col_nbr_avrg 0.7135277 0.18406056 0.57778525 -0.35094752
Coordinates of variables
head(var31$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4
## Circulation_avrg 2.637646 77.4340500 19.7046617 0.2236423
## Page_nbr_avrg 22.132688 18.8750218 41.5549570 17.4373333
## Page_surface 44.880986 0.3442087 0.2513791 54.5234260
## Col_nbr_avrg 30.348680 3.3467195 38.4890022 27.8155984
Graph of variables (default plot)
fviz_pca_var(res.pca31, col.var = "black")
Graph of variables using colors to represent variables contribution:
fviz_pca_var(res.pca31, col.var="contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
Color variables by group:
# Create a grouping variable using kmeans
# Create 3 groups of variables (centers = 3)
set.seed(123)
res.km31 <- kmeans(var31$coord, centers = 3, nstart = 25)
grp31 <- as.factor(res.km31$cluster)
# Color variables by groups
fviz_pca_var(res.pca31, col.var = grp31,
palette = c("#0073C2FF", "#EFC000FF", "#868686FF"),
legend.title = "Cluster")
Correlation plot (quality of projection):
library(corrplot)
corrplot(var31$cos2, is.corr = FALSE)
Correlation plot (contribution):
corrplot(var31$contrib, is.corr = FALSE)
Contribution of variables to the first dimension:
fviz_contrib(res.pca31, choice = "var", axes = 1, top = 10,
title = "Variables contribution to the first dimension")
Contribution of variables to the second dimension:
fviz_contrib(res.pca31, choice = "var", axes = 2, top = 10, title = "Variables contribution to the second dimension")
Extract the results for individuals
ind31<- get_pca_ind(res.pca31)
ind31
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
Coordinates of individuals
head(ind31$coord)
## Dim.1 Dim.2 Dim.3 Dim.4
## 1.News Paper 2.421559 8.120157 -3.98422975 0.2885717
## 2.Sin Wen Pao Pictorial 1.688944 7.766754 -4.99987393 0.7300622
## 3.Shanghai Herald 2.345387 8.122252 -3.99784024 0.1937234
## 4.Manchu Nippon, The 2.542951 2.725193 0.15364461 -0.7952601
## 5.Eastern Times 2.291486 2.635480 -0.13798812 -0.5907491
## 6.China Times 2.325594 2.659945 -0.08797687 -0.4878067
Individuals quality of projection on the two first dimensions (35 best projected individuals):
fviz_cos2(res.pca31, choice = "ind", axes = 1:2, top = 30) +
coord_flip()
Contribution of individuals on the two first dimensions (35 most contributing):
fviz_contrib(res.pca31, choice = "ind", axes = 1:2, top = 30) +
coord_flip()
Graph of individuals:
fviz_pca_ind(res.pca31, col.ind = "cos2", pointsize = "contrib",
alpha.ind ="contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping (slow if many points)
)
We use repel = TRUE to avoid overplotting. We control the color of individuals according to the quality of the projection on the factor map (cos2) quality of the individuals on the factor map, and we use a color gradient to show the gradual quality of projection. The size and transparency of points is proportionate to their contribution.
Biplot of individuals and variables:
fviz_pca_biplot(res.pca31, col.ind = "cos2",
alpha.ind ="contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
label = "var",
col.var = "black",
repel = TRUE)
Note: transparency of points proportionate to their contribution.
Color individuals by groups (periodicity):
periodicity <- as.factor(crowidpca31[, "Periodicity"])
fviz_mca_ind(res.pca31, habillage = periodicity,
pointsize = "contrib",
alpha.ind ="contrib",
addEllipses = TRUE,
label = "var",
repel = TRUE,
title = "Periodicals by Periodicity (1931)")
Note: the size and transparency of points is proportionate to their contribution.
Biplot with clusters of variables and periodicity groups:
fviz_pca_biplot(res.pca31,
# Fill individuals by groups
geom.ind = "point",
pointshape = 21,
pointsize = 2.5,
fill.ind = crowidpca31$Periodicity,
legend.title = list(fill = "Periodicity", color = "Clusters"),
col.var = factor(c("Circulation", "Length", "Format", "Format")),
repel = TRUE,
title = "Biplot: Periodicity and clustered variables (1931)")
Color individuals by groups (language):
lang <- as.factor(crowidpca31[, "Language"])
fviz_mca_ind(res.pca31, habillage = lang,
alpha.ind ="contrib",
pointsize = "contrib",
addEllipses = TRUE,
label = "none",
repel = TRUE,
title = "Periodicals by Language (1931)")
Note: the size and transparency of points is proportionate to their contribution.
Biplot with cluster of variables and language groups:
fviz_pca_biplot(res.pca31,
# Fill individuals by groups
geom.ind = "point",
pointshape = 21,
pointsize = 2.5,
fill.ind = crowidpca31$Language,
legend.title = list(fill = "Language", color = "Clusters"),
col.var = factor(c("circulation", "length", "format", "format")),
repel = TRUE,
title = "Biplot: Language and clustered variables (1931)")
We perform a hierarchical clustering on all four dimensions:
res.pca31<-PCA(crowidpca31,quali.sup=c(1,2),graph=FALSE)
res.HCPC31<-HCPC(res.pca31,nb.clust=4,consol=FALSE,graph=FALSE)
Cluster dendograms:
fviz_dend(res.HCPC31, show_labels = FALSE,
main = "Cluster dendogram of Chinese periodicals (1931)",
caption = "Based on 'Newspaper Directory of China' (1931)")
Individuals (periodicals) factor map:
fviz_cluster(res.HCPC31, geom = "point",
main = "Factor map of Chinese periodicals (1931)",
caption = "Based on 'Newspaper Directory of China' (1931)")
Summary of statistics:
res.HCPC31$desc.var$test.chi2 # Variables
## p.value df
## Periodicity 1.285764e-55 18
desccat <- res.HCPC31$desc.var$category # by variable categories
res.HCPC31$desc.axes # by principal components
##
## Link between the cluster variable and the quantitative variables
## ================================================================
## Eta2 P-value
## Dim.1 0.7597870 3.045584e-109
## Dim.2 0.6809428 1.902079e-87
## Dim.3 0.5506993 3.530850e-61
## Dim.4 0.4523620 5.247230e-46
##
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
## v.test Mean in category Overall mean sd in category Overall sd
## Dim.3 10.629577 4.928918 -9.739523e-15 0.7108728 0.9313183
## Dim.4 7.972298 2.641308 8.912401e-15 1.3452588 0.6654232
## Dim.2 7.020813 3.517029 -2.466292e-15 0.6707162 1.0061230
## Dim.1 -8.233240 -5.309447 -7.782609e-15 1.4418396 1.2952122
## p.value
## Dim.3 2.170954e-26
## Dim.4 1.557510e-15
## Dim.2 2.205812e-12
## Dim.1 1.822159e-16
##
## $`2`
## v.test Mean in category Overall mean sd in category Overall sd
## Dim.3 -4.785711 -0.2777771 -9.739523e-15 0.7160729 0.9313183
## Dim.4 -10.569341 -0.4383266 8.912401e-15 0.4532466 0.6654232
## Dim.1 -13.362588 -1.0786572 -7.782609e-15 0.6404140 1.2952122
## p.value
## Dim.3 1.703834e-06
## Dim.4 4.133905e-26
## Dim.1 1.000252e-40
##
## $`3`
## v.test Mean in category Overall mean sd in category Overall sd
## Dim.1 14.500668 0.8785098 -7.782609e-15 0.6071938 1.2952122
## Dim.4 8.626844 0.2685144 8.912401e-15 0.4916868 0.6654232
## Dim.3 3.989533 0.1737952 -9.739523e-15 0.5459008 0.9313183
## Dim.2 -3.292986 -0.1549739 -2.466292e-15 0.5148146 1.0061230
## p.value
## Dim.1 1.199774e-47
## Dim.4 6.306810e-18
## Dim.3 6.620360e-05
## Dim.2 9.912942e-04
##
## $`4`
## v.test Mean in category Overall mean sd in category Overall sd
## Dim.2 13.816093 8.003054 -2.466292e-15 0.1670918 1.0061230
## Dim.1 2.885855 2.151963 -7.782609e-15 0.3288776 1.2952122
## Dim.3 -8.070509 -4.327315 -9.739523e-15 0.4756037 0.9313183
## p.value
## Dim.2 2.038377e-43
## Dim.1 3.903522e-03
## Dim.3 7.000587e-16
res.HCPC31$desc.ind$para # by individuals
## Cluster: 1
## 355.Commercial Directory of Shanghai
## 0.7439473
## 38.Current Events
## 1.8143634
## 199.Public Education Monthly
## 1.8159900
## 96.Directory and Chronicle of China, Japan, Straits, Philippines, etc.
## 3.4974420
## ------------------------------------------------------------
## Cluster: 2
## 98.Harbin Noon Press 114.Iron News, The
## 0.5190142 0.5501975
## 312.Wind, The 76.New Province's Daily News
## 0.5717764 0.5735409
## 203.Petty News
## 0.5810084
## ------------------------------------------------------------
## Cluster: 3
## 66.Honan Citizens' Press 81.New Kiangsu Press
## 0.2379443 0.2384903
## 345.People's Daily News 322.People's News
## 0.2416711 0.2418862
## 319.New Eastern Hill Daily News
## 0.2493710
## ------------------------------------------------------------
## Cluster: 4
## 3.Shanghai Herald 1.News Paper 2.Sin Wen Pao Pictorial
## 0.4521510 0.4663188 0.9103843