Click here for the tutorial video (ctrl + click)
rm(list=ls(all=T))
options(digits=4, scipen=40)
library(dplyr)
load("data/Biz.rdata")
load("data/Rev.rdata")
LOAD = TRUE   # TRUE: load the pre-computed t-SNE / score results below instead of re-running them
if(LOAD) { load("data/tsne.rdata") }
rev - Review Data Frame
The review data frame contains the review id, business id, star rating, review date, sentiment scores, and related fields.
head(rev)
rid bid date stars cool funny useful anger anticipation disgust fear joy sadness
1 1 154938 2014-02-17 4 0 0 1 1 1 0 0 3 0
2 2 36983 2016-07-24 4 0 0 0 0 1 1 0 1 0
3 3 103253 2012-04-07 5 0 0 2 0 1 0 0 1 0
4 4 25101 2015-09-11 5 0 0 0 0 1 1 0 0 0
5 5 182709 2016-01-22 5 1 1 1 0 1 0 0 1 0
6 6 135863 2014-09-17 5 0 0 1 0 2 1 0 6 0
surprise trust negative positive business_id user_id nchar year
1 0 3 1 3 Ue6-WhXvI-_1xUIuapl0zQ gVmUR8rqUFdbSeZbsg6z_w 322 2014
2 1 1 1 3 Ae4ABFarGMaI5lk1i98A0w Y6qylbHq8QJmaCRSlKdIog 137 2017
3 1 1 0 1 lKq4Qsz13FDcAVgp49uukQ SnXZkRN9Yf060pNTk1HMDg 108 2012
4 0 0 1 1 6nKR80xEGHYf2UxAe_Cu_g VcmSgvslHAhqWoEn16wjjw 113 2016
5 0 1 0 2 Z_mJYg3vi8cPZHa1J4BALw NKF9v-r0jd1p0JVi9h2T1w 200 2016
6 1 5 0 7 R1PQEK6qvrZVC9qcWfKvDA c2MQ_LPuvtiiKFR_-OY9pg 326 2015
n_distinct(rev$user_id) # no. user = 1518169
[1] 1518169
n_distinct(rev$bid) # no. biz = 188593
[1] 188593
sapply(c(5, 10, 50, 100), function(i) sum(rev$nchar <= i))   # no. of reviews with at most 5/10/50/100 characters
[1] 69 163 8630 177993
breaks = as.Date(c("2004-07-02", paste0(2005:2018, "-07-02")))   # cut the years at July 2
rev$year = as.integer(cut(rev$date, breaks)) + 2004              # add a year column
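As a worked example (using the second review shown in head(rev) above): a review dated 2016-07-24 falls after the 2016-07-02 break, so this cut assigns it to year 2017.
# worked example of the July-2 cut: 2016-07-24 lands in the [2016-07-02, 2017-07-02) bin
as.integer(cut(as.Date("2016-07-24"), breaks)) + 2004   # 2017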
par(cex=0.8, mfrow=c(1,3), mar=c(7,5,4,2))
table(rev$year) %>%
barplot(las=2, main="#Reviews by Year(cut at Jul02)",
xlab="", ylab="")
table(rev$stars) %>% barplot(main="No. Stars")
hist(rev$nchar, main="No.Characters")
# average scores
df = aggregate(cbind(stars,cool,funny,useful) ~ year, data = rev, FUN = mean)
par(cex=0.8, mfrow=c(1,4), mar=c(3,4,4,1))
mapply(barplot, df[2:5], main=names(df)[2:5], las=2)
user - Reviewer Data Frame
user = rev %>% group_by(user_id) %>% summarise(
  n = n(),
  star = mean(stars),
  funny = mean(funny),
  useful = mean(useful),
  cool = mean(cool)
  )
save(user, file="data/User.rdata", compress=T)
par(cex=0.8, mfrow=c(1,2), mar=c(5,5,4,2))
hist(log(user$n), main="No. Reviews per User (log)")   # y-axis shows frequency
hist(user$star, main="Avg. Stars per User")
par(cex=0.8, mfrow=c(1,3), mar=c(5,5,4,2))
hist(pmin(user$funny,10), main="Avg. Funny's per User")
hist(pmin(user$cool,10), main="Avg. Cool's per User")
hist(pmin(user$useful,10), main="Avg. Useful's per User")
X - Biz-Category Matrix
The business-category (BC) matrix, with 1306 categories.
dim(X)
Loading required package: Matrix
par(cex=0.8, mar=c(3,4,4,2))
rowSums(X) %>% table %>% head(10) %>% barplot(main="No. Category per Biz")
par0 = par(cex=0.7, mar=c(11,4.5,3,0))
colSums(X)[1:40] %>% barplot(las=2, main="Top 40 Biz Category")
X - dense BC matrix, 936 categories
X = X[,colSums(X) > 20]   # keep only the categories used by more than 20 businesses
dim(X) # 188593 936
[1] 188593 936
identical(B$business_id, rownames(X)) # TRUE
[1] TRUE
C - Business Category Summary
C = apply(X, 2, function(v) c(sum(v), sum(B[v,]$review_count)))   # per category: no. of businesses, total reviews
C = C %>% t %>% data.frame %>% setNames(c("n_biz", "n_rev")) %>%
  mutate(a_rev = n_rev/n_biz)                                     # a_rev: average reviews per business
C$name = colnames(X)
sapply(list(X=X, B=B, C=C), dim)
X B C
[1,] 188593 188593 936
[2,] 936 80 4
Using t-SNE, the dimensionality of X is reduced from [188593 x 936] to [2 x 936] …
library(RColorBrewer)
library(wordcloud)
library(Rtsne)
if(!LOAD) {
t0 = Sys.time()
set.seed(123)
tsneCat = Rtsne(as.matrix(t(X)), check_duplicates=F, theta=0.0, max_iter=3000)
Sys.time() - t0 # 3.857 mins
}
Y = tsneCat$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 80 # number of clusters
C$group = g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters
[1] 2 2 4 4 4 4 4 4 5 6 6 6 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 9 9 9 9
[33] 9 10 10 10 10 10 10 10 11 11 11 11 11 11 11 12 12 13 13 13 14 14 14 15 15 15 15 15 15 16 16 16
[65] 16 17 17 17 17 18 18 18 20 20 21 23 23 24 28 31
Scale the label size by C$n_rev and check its range:
sz = 0.7 + sqrt(C$n_rev)/500   # label size grows with the square root of the category's review count
range(sz)
[1] 0.7211 4.5235
png("fig/category.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], C$name, font=2,
col = randomcoloR::distinctColorPalette(K)[g],
cex = sz ) # size by no. reviews
dev.off()
png
2
Drawing the word cloud into category.png:
We used both dimension reduction and cluster analysis to build the word cloud above, where …
■ For the dimension reduction:
● How many original dimensions were there?
● How many dimensions remain after the reduction?
● What were the original dimensions? In other words, what did we base the dimension reduction on?
■ What did we base the cluster analysis on?
● The original dimensions, or the reduced ones?
● What difference would it make to use the original versus the reduced dimensions?
(A short code sketch below shows where these answers can be read off.)
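As a reference point, here is a minimal sketch, assuming the objects built above (X, tsneCat and the hclust result hc) are still in the workspace:
dim(t(X))        # original space: 936 categories x 188593 business-membership indicators
dim(tsneCat$Y)   # reduced space:  936 categories x 2 t-SNE coordinates
# the clusters were cut from hclust(dist(Y)), i.e. computed on the reduced 2-D coordinates,
# not on the raw 188593-dimensional indicators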
Using Word Clouds to Observe the Similarity between Review Themes
Next we turn to the review themes. Using Stanford's Empath Text Classifier with its 194 default content classes, we have already scored each of the 5,996,996 reviews, so every review in the corpus carries 194 class scores, stored in data/Tmx.rdata.
Because the data are large, we pre-average the theme scores by business and by reviewer.
load("data/Tmx.rdata")
if(LOAD) { load("data/BCscores.rdata") } else
{
t0 = Sys.time()
cat_senti = apply(X, 2, function(v) colMeans( rev[rev$bid %in% B$bid[v], 8:17] ) ) %>% t   # avg. sentiment per category
Sys.time() - t0
biz_senti = aggregate(. ~ bid, data=rev[, c(2,8:17)], mean)                                # avg. sentiment per business
Sys.time() - t0
cat_theme = apply(X, 2, function(v) colMeans( Tmx[rev$bid %in% B$bid[v],] ) ) %>% t        # avg. theme scores per category
Sys.time() - t0
biz_theme = aggregate(as.data.frame.matrix(Tmx), list(bid = rev$bid), mean)                # avg. theme scores per business
Sys.time() - t0
gc()
save(biz_senti, biz_theme, cat_senti, cat_theme, file="data/BCscores.rdata", compress=T)
}
biz_senti - Average Sentiment Scores per Business
dim(biz_senti)   # dimensions of biz_senti
[1] 188593 11
biz_theme - Average Theme Weights per Business
dim(biz_theme)   # dimensions of biz_theme
[1] 188593 195
themes = data.frame(name=colnames(Tmx), weight=colSums(Tmx), stringsAsFactors=F)
par(mar=c(8,4,4,2), cex=0.7)
colSums(Tmx)[1:20] %>% barplot(main="Sums of Empath Scores, Top20 Themes", las=2)
par(mar=c(2,4,4,2), cex=0.7)
themes$weight %>% barplot(main="Sums of Empath Scores")
if(!LOAD) {
t0 = Sys.time()
set.seed(123)
tsneTheme = biz_theme[,-1] %>% scale %>% as.matrix %>% t %>%
Rtsne(check_duplicates=F, theta=0.0, max_iter=3000)
Sys.time() - t0 # 29.21 secs
}
Y = tsneTheme$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 40 # number of clusters
themes$group = g = cutree(hc,K) # clustering for color
table(g) %>% as.vector %>% sort # size of clusters
[1] 2 2 3 3 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 7 7 7 7 7 8
sz = sqrt(themes$weight)/100 + 1.5
range(sz)
[1] 1.591 5.122
png("fig/theme.png", width=3200, height=1800)
textplot(Y[,1], Y[,2], themes$name, font=2,
col = randomcoloR::distinctColorPalette(K)[g], # color by group
cex = sz ) # size by total weight
dev.off()
png
2
TC - Theme-Category Matrix
library(d3heatmap)
#TC = apply(X, 2, function(i) 100*colMeans(E[i > 0,]) )
#dim(TC)
#sapply(list(TC, colSums(TC), rowSums(TC)), range)
cat_theme[1:100, 4:80] %>% t %>% d3heatmap(colors = cm.colors(13)[3:13])
# rev(brewer.pal(11,"Spectral"))
# Here we take 350 business categories against all 194 theme classes to build a heatmap,
# which shows where the themes concentrate within each business category.
cat_theme[1:350, 1:194] %>% scale %>% d3heatmap(
show_grid=F,
xaxis_font_size = "0px", xaxis_height = 10,
yaxis_font_size = "0px", yaxis_width = 10,
colors = brewer.pal(9,"Greens")
)
# library(morpheus)
# cat_theme[1:200, 4:100] %>% t %>% morpheus
#x = sapply(1:max(C$group), function(i) rowSums(X[,C$group == i]) > 0)
#x = apply(x, 2, function(i) 100 * colMeans(E[i,]))
#x = sapply(1:max(theme$group), function(i)
# colMeans(x[theme$group==i,] ) )
#sapply(list(TC, colSums(TC), rowSums(TC)), range)
#x %>% scale %>% t %>% d3heatmap(
# scale="none",
# colors = rev(brewer.pal(11,"Spectral")))
biz_senti - Average Sentiment Scores per Business
The previous part looked at how review themes relate to one another. Next we want to know which sentiments customers' reviews lean toward for each business. We start with summary(), a boxplot, and a correlation plot to examine the distribution of, and the relationships among, the 10 review sentiments.
S = biz_senti[,-1]
summary(S)
anger anticipation disgust fear joy sadness
Min. :0.000 Min. : 0.00 Min. :0.000 Min. : 0.000 Min. : 0.00 Min. : 0.000
1st Qu.:0.500 1st Qu.: 1.80 1st Qu.:0.333 1st Qu.: 0.500 1st Qu.: 1.73 1st Qu.: 0.582
Median :0.762 Median : 2.33 Median :0.623 Median : 0.821 Median : 2.43 Median : 0.898
Mean :0.882 Mean : 2.48 Mean :0.703 Mean : 0.965 Mean : 2.54 Mean : 1.024
3rd Qu.:1.143 3rd Qu.: 3.00 3rd Qu.:0.958 3rd Qu.: 1.250 3rd Qu.: 3.20 3rd Qu.: 1.333
Max. :9.000 Max. :13.33 Max. :8.000 Max. :13.667 Max. :16.00 Max. :10.667
surprise trust negative positive
Min. :0.000 Min. : 0.00 Min. : 0.00 Min. : 0.00
1st Qu.:0.692 1st Qu.: 2.33 1st Qu.: 1.27 1st Qu.: 4.00
Median :1.000 Median : 3.00 Median : 1.88 Median : 5.00
Mean :1.092 Mean : 3.13 Mean : 2.10 Mean : 5.28
3rd Qu.:1.368 3rd Qu.: 3.73 3rd Qu.: 2.67 3rd Qu.: 6.29
Max. :8.000 Max. :19.33 Max. :18.67 Max. :31.00
# boxplot of the average sentiments
par(cex=0.8, mar=c(6,4,4,2))
boxplot(S, las=2, main="Avg. Sentiment per Biz")
# load the corrplot library for the correlation plot
library(corrplot)
corrplot 0.84 loaded
# sentiment correlation plot
par(cex=0.7)
corrplot.mixed(cor(S))
CS (Category x Sentiment) and CT (Category x Theme) Matrices
Next we average the review sentiment scores (10 items) and the review theme scores (194 items) over the business categories (936 of them), and put the results into two matrices:
CS [936 x 10]: the 10 average sentiment scores for each business category
CT [936 x 194]: the 194 average theme scores for each business category
# category x sentiment matrix
CS = cat_senti
# category x theme matrix
CT = cat_theme
dim(CS); dim(CT)
[1] 936 10
[1] 936 194
Because there are 10 sentiments, the data are spread over many dimensions, while two dimensions are much easier to inspect. Collapsing many dimensions into two inevitably loses some information, so we first run principal component analysis (PCA) on the sentiment matrix (CS) to find the directions that retain the most information.
library(FactoMineR)
library(factoextra)
Loading required package: ggplot2
library(highcharter)
# how many principal components to keep
ncp=10
pcx = PCA(CS, ncp=ncp, graph=F)
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')
As the plot above shows, the first two principal components already cover about 80% of the variance, so we draw the map on those two dimensions. However, when we try to place the business categories on the plane of the first two components, we run into the following problem:
par(cex=0.7)
fviz_pca_biplot(pcx)
The business categories pile up into one unreadable blob, so below we introduce a plotting helper for PCA maps.
In recent years most R plotting packages have gained the ability to produce interactive web output. We start by writing a helper function that makes it easier to inspect the PCA results.
source("bipcx.R")
With bipcx() we can clearly see how the business categories behave on the first and second principal components (since many categories have few reviews, we only plot the top 400 categories).
PCA plot of principal components 1 and 2
bipcx(pcx,1,2,10,400,t1="Strength",t2="Valence",obs='Biz Category',
main="PCA on Sentiment Scores",ratio=0.5)
PCA plot of principal components 2 and 3
bipcx(pcx,3,2,10,300,t1="Arousal",t2="Valence",obs='Biz Category',
main="PCA on Sentiment Scores")
Looking at the business categories along these axes, we can see which types of sentiment each category evokes in its customers, whether those sentiments lean positive or negative, and how strongly they affect customers.
Now we run PCA on the themes against the business categories. Because the theme matrix has far more dimensions (194) than the sentiment matrix (10), even when we keep only the top 600 business categories and the top 50 theme classes, the variance is spread across many principal components.
ncp=30
# keep the business categories and themes with more reviews
pcx = PCA(CT[1:600, 1:50],ncp=ncp,graph=F)
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance", las=2)
abline(h=seq(0,100,10),col='lightgray') # 12 PC's cover ~75% of variance
As the plot shows, after PCA the first 4 components cover only about 60% of the variance, so many more components are needed to capture most of the information. With this many data points and dimensions, interactive charts make it much easier to see how the original dimensions relate to the data points.
Below we take the leading principal components in pairs and, for each plane, draw the 20 themes and 200 business categories with the largest variance on that plane. These views reveal relationships that are hard to see in simple descriptive statistics, and different component pairs show the data from different angles.
# PCA plot of components 1 & 2, with the 200 most variable business categories and the 20 most variable themes
bipcx(pcx,1,2,20,200,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 1 & 2",ratio=0.5)
# PCA plot of components 3 & 4, with the 200 most variable business categories and the 20 most variable themes
bipcx(pcx,3,4,20,200,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 3 & 4")
# PCA plot of components 1 & 3, with the 200 most variable business categories and the 20 most variable themes
bipcx(pcx,1,3,20,200,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 1 & 3")
# PCA plot of components 2 & 4, with the 200 most variable business categories and the 20 most variable themes
bipcx(pcx,2,4,20,200,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 2 & 4")
categories = C
save(biz_senti, biz_theme, cat_senti, cat_theme, categories, themes,
file="data/BCscores.rdata", compress=T)
save(tsneCat, tsneTheme, file="data/tsne.rdata", compress=T)