Click here to watch the tutorial video (ctrl + click)
## Set the time zone and character set
Sys.setlocale(category="LC_ALL", locale = "en_US.UTF-8")
## [1] "LC_CTYPE=en_US.UTF-8;LC_NUMERIC=C;LC_TIME=en_US.UTF-8;LC_COLLATE=en_US.UTF-8;LC_MONETARY=en_US.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
rm(list=ls(all=T)); gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 596633 31.9 940480 50.3 750400 40.1
## Vcells 1060431 8.1 2060183 15.8 1296330 9.9
library(knitr); opts_chunk$set(comment = NA)
library(Matrix)
library(dplyr)
library(googleVis)
library(doParallel)
library(Rtsne)
library(wordcloud)
library(RColorBrewer)
library(randomcoloR)
library(d3heatmap)
library(morpheus)
library(FactoMineR)
library(factoextra)
library(highcharter)
library(tm)
library(slam)
library(MASS)
options(width=100, gvis.plot.tag='chart', digits=4, scipen=60)
#options(width=100, digits=4, scipen=60)
load("data/Biz.rdata")
load("data/Rev.rdata")
load("data/BCscores.rdata")
Data preprocessing: find the 100 businesses (bid) with the most reviews
b100 = rev %>% count(bid) %>% arrange(desc(n)) %>% head(100) %>% .$bid
Data preprocessing: for each of these businesses, average the sentiment scores by year
df = rev %>% filter(bid %in% b100) %>%
group_by(bid, year) %>%
summarise(
positive = mean(positive),
negative = mean(negative),
sad = mean(sadness),
n = n()
)
Data preprocessing: keep the businesses that have reviews in every year from 2009 on (10 years)
y09 = df %>% filter(year >= 2009) %>% count(bid) %>% filter(n == 10) %>% .$bid
<Important> You need to allow Flash in Chrome (Settings > Advanced > Content settings > Flash) to see the Google Motion Chart.
df %>% filter(year >= 2010 & year <= 2017 & bid %in% y09) %>%
data.frame %>%
gvisMotionChart("bid", "year") %>% plot
#
# This chunk should be executed in high-end server
#
# library(doParallel)
# detectCores()
# cores <- makeCluster(4)
# registerDoParallel(cores)
# getDoParWorkers()
# t0 = Sys.time()
# cat936 = foreach(i=1:936, .combine=rbind, .packages="Matrix") %dopar% {
# rx = rev[rev$bid %in% B$bid[X[,i]],]
# cbind(
# category = colnames(X)[i],
# n = as.integer( table(rx$year) ),
# aggregate(. ~ year, rx[,c(4:17,20,21)], mean)
# ) }
# Sys.time() - t0 # 21.229 mins
# stopCluster(cores)
#
# CatYr = cat936 %>% mutate(
# sentiment = positive - negative,
# engagement = log(1 + cool + funny + useful)
# ) %>%
# group_by(year) %>% mutate(
# z_sentiment = scale(sentiment),
# z_engagement = scale(engagement)
# ) %>% ungroup %>% data.frame
# save(CatYr, file="data/CatYr.rdata",compress=T)
#
# For each business category, pull out all the businesses in that category and average the review columns by year
# Doing this for every category takes a long time, so we run it in parallel on the big-data server
load("data/CatYr.rdata")
Example: restaurant categories (fast food declines year after year)
bizcats = function(cats, yr) {
CatYr %>% as_tibble %>%
filter(year >= yr) %>%
filter(category %in% cats) %>%
dplyr::select(
category, year,
z_sentiment, stars, z_engagement, n,
engagement, sentiment, useful, cool, funny,
positive, joy, anticipation, trust, surprise,
negative, sadness, anger, disgust, fear
) %>%
data.frame %>% gvisMotionChart("category", "year") %>% plot()
}
# bizcats(colnames(X)[1:10], 2009)
bizcats(colnames(X)[11:40], 2009)
# bizcats(colnames(X)[41:70], 2009)
# bizcats(colnames(X)[71:100], 2009)
Select the businesses (bid) in the Beauty & Spas category and store them in the data frame spas.
spas = rev %>%
filter(bid %in% B$bid[X[,"Beauty & Spas"]]) %>%
filter(year >= 2011) %>%
group_by(bid) %>%
summarise(
n_year = n_distinct(year), # number of years with at least one review
n_rev = n() # total number of reviews
) %>%
filter(
n_year == 8,
n_rev > 200 & n_rev < 500
) %>%
dplyr::select(bid) %>%
left_join(B)
Joining, by = "bid"
nrow(spas)
[1] 44
Select all the reviews these businesses received from 2011 onward and store them in the data frame R.
R = rev %>% filter(bid %in% spas$bid & year >= 2011)
par(mar=c(3,4,3,2), cex=0.8)
R %>% count(bid) %>% .$n %>% hist(8, main="No. Reviews per Biz")
Review volume per year
par(mar=c(3,4,3,2), cex=0.8)
R$year %>% table %>% barplot(main="No. Review by Year")
In the motion chart below, the Y axis is stars (the rating reviewers give the business); the X axis is engagement (how much interaction a review draws: views, shares, likes), computed as engage = log(1 + useful + cool + funny); color is sentiment (whether the language of the review is positive or negative, averaged per business and year); n is the review volume.
R %>%
filter(year >= 2014) %>%
filter(! bid %in% c(173534,63170)) %>%
group_by(bid, year) %>%
mutate(
engage = log(1 + useful + cool + funny),
senti = positive - negative
) %>%
summarise(
engage = mean(engage),
stars = mean(stars),
senti = mean(senti),
n = n() ) %>%
ungroup %>% group_by(year) %>%
mutate(
z_senti = scale(senti),
z_engage = scale(engage)
) %>% ungroup %>%
left_join(B[,c("bid","name")]) %>% data.frame -> df
Joining, by = "bid"
chain = table(df$name) %>% .[. > 5] %>% names # names that appear more than 5 times (chain stores)
i = df$name %in% chain
df$name[i] = paste(df$name[i], df$bid[i]) # disambiguate chain branches by appending the bid
df = df[,c("name", "year", "senti", "stars", "engage", "n",
"z_senti","z_engage","bid")]
df %>% gvisMotionChart("name", "year") %>% plot()
Use linear regression to find the businesses whose star ratings are rising or falling the fastest.
library(stringr)
bx = sapply(split(df, df$bid), function(x) {
lm(stars ~ year, x) %>% coef %>% `[`("year") }) %>% # take lm coef.
.[ abs(.) > 0.15 ] %>% # keep businesses with |slope| > 0.15 stars per year
names %>% str_extract("[0-9]*") %>% # recover the bid from the coefficient names
as.integer
df %>% filter(bid %in% bx) %>% dplyr::select(-bid) %>%
gvisMotionChart("name", "year") %>% plot()
load("data/Tmx.rdata")
empath = cbind(R[,"rid"], as.data.frame.matrix(Tmx[R$rid,]))
rm(Tmx); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 3631836 194 5684620 303.6 5684620 303.6
Vcells 144191051 1100 437642322 3339.0 370851918 2829.4
# Make biz-theme matrix - `mx`
mx = sapply(split(empath[,-1], R$bid), colMeans) %>% t # biz-theme matrix
bnames = B$name[match(rownames(mx), B$bid)]
rownames(mx) = bnames
# Select themes with medium-high weights
rx = colSums(mx) %>% quantile(c(0.5, 1.0)) # median and maximum of the total theme weights
mx = mx[, colSums(mx) > rx[1] & colSums(mx) < rx[2]] # keep themes between the median and the maximum
# Check & adjust the range of weights
par(cex=0.8)
hist(log(mx+1e-4))
library(d3heatmap)
mx %>% {log(.+1e-4)} %>% t %>% d3heatmap(color=cm.colors(17))
This tool cannot be embedded in the web page; it only works interactively inside RStudio.
# library(morpheus)
# mx %>% {log(.+1e-4)} %>% morpheus
Project the businesses onto the dimension-reduced sentiment plane.
##### 7.1 Build the business-sentiment matrix
pcx = sapply(split(R[,8:17], R$bid), colMeans) %>% t
rownames(pcx) = bnames
library(FactoMineR)
library(factoextra)
pcx = pcx %>% scale %>% PCA(ncp=10, graph=F)
par(cex=0.8)
barplot(pcx$eig[1:10,3],names=1:10,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')
The first three principal components already account for 90% of the variance.
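To verify this, one can print the first rows of the eigenvalue table returned by FactoMineR (its third column is the cumulative percentage of variance used in the barplot above):
round(pcx$eig[1:3, ], 2) # eigenvalue, % of variance, cumulative % of variance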
source("bipcx.R")
N = n_distinct(R$bid)
bipcx(pcx,1,2,10,N,t1="Strength",t2="Valence",
obs='Business', main="Strength & Valence of Sentiment")
bipcx(pcx,3,2,10,N,t1="Arousal",t2="Valence",
obs='Business', main="Arousal & Valence of Sentiment")
load("data/Txt.rdata")
txt = Txt[R$rid]
rm(Txt); gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 3703314 197.8 9968622 532.4 9698593 518
Vcells 151736186 1157.7 504304754 3847.6 615112838 4693
library(tm)
dtm = txt %>%
iconv(to = "utf-8", sub="") %>%
VectorSource %>% Corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords("english")) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix %>%
removeSparseTerms(0.998)
dtm # (documents: 12729, terms: 2479)
<<DocumentTermMatrix (documents: 12729, terms: 2479)>>
Non-/sparse entries: 612069/30943122
Sparsity : 98%
Maximal term length: 13
Weighting : term frequency (tf)
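Before any weighting, a quick look at the most common stems helps confirm that the cleaning steps behaved as expected; findFreqTerms() from tm lists every term above a frequency floor (the floor of 2000 is just an illustrative choice):
findFreqTerms(dtm, lowfreq = 2000) %>% head(20) # frequent stems after cleaning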
Filter terms by TF-IDF
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0380 0.0897 0.1100 0.1204 0.1353 0.5020
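The score above is the usual tm recipe: each term's mean normalized frequency (over the documents that contain it) times log2(N / document frequency). The cutoff 0.0897 used below is simply the first quartile of these scores, so it could also be derived programmatically instead of being hard-coded (a small sketch):
q1 = quantile(tfidf, 0.25) # first quartile of the tf-idf scores, about 0.0897 in this run
# dtm = dtm[, tfidf > q1] # equivalent to the hard-coded threshold used below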
dtm = dtm[, tfidf > 0.0897 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
[1] 12729 1859
Dimension reduction with tSNE
library(Rtsne)
n = 1000
tsne = dtm[, 1:n] %>% as.data.frame.matrix %>%
scale %>% t %>%
Rtsne(check_dup=F, theta=0.0, max_iter=3000)
Y = tsne$Y # tSNE coordinates
d = dist(Y) # distance matrix
hc = hclust(d) # hi-clustering
K = 100 # number of clusters
g = cutree(hc,K) # cut into K clusters
table(g) %>% as.vector %>% sort # sizes of clusters
[1] 2 2 2 3 3 3 3 3 4 4 4 4 4 5 5 5 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7
[32] 7 7 7 8 8 8 8 8 8 8 8 9 9 9 9 9 9 9 10 10 10 10 10 10 10 10 10 10 10 10 11
[63] 11 11 12 12 12 12 12 12 12 13 13 13 13 14 14 14 14 14 14 15 15 15 15 15 15 15 16 17 17 17 17
[94] 17 18 18 19 20 23 24
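Each of the first n terms now carries a cluster id in g; listing the terms of a few of the larger clusters is a quick way to check whether the groups look semantically coherent (a minimal sketch):
terms_by_cluster = split(colnames(dtm)[1:n], g) # term lists per cluster
sapply(terms_by_cluster[order(-lengths(terms_by_cluster))[1:3]], head, 10) # first 10 terms of the 3 largest clusters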
wc = col_sums(dtm[,1:n])
sz = 0.15 + sqrt(wc/mean(wc))
range(sz)
[1] 0.6989 6.3995
library(randomcoloR)
library(wordcloud)
colors = distinctColorPalette(K)
png("fig/spas2.png", width=3200, height=1800)
textplot(
Y[,1], Y[,2], colnames(dtm)[1:n], show=F,
col=colors[g],
cex= sz,
font=2)
dev.off()
png
2
Finally, we use a Shiny App to examine the relationship between review content and reader ratings, and to compare how that relationship changes across business categories. With this app we show that Shiny, a very flexible data-visualization tool, can be combined with R's powerful text-analysis capabilities to build many useful and interesting applications.
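The app itself is linked below. As an illustration only, a minimal skeleton of this kind of tool might look like the following sketch (assuming the rev, B, and X objects from this analysis are in memory): pick a category, then plot its average star rating by year.
library(shiny)
library(dplyr)
library(ggplot2)

ui <- fluidPage(
  selectInput("cat", "Business category", choices = colnames(X)),
  plotOutput("trend")
)

server <- function(input, output) {
  output$trend <- renderPlot({
    rev %>%
      filter(bid %in% B$bid[X[, input$cat]]) %>% # reviews of businesses in the chosen category
      group_by(year) %>%
      summarise(stars = mean(stars)) %>% # average star rating per year
      ggplot(aes(year, stars)) + geom_line() + geom_point()
  })
}

# shinyApp(ui, server) # uncomment to run the app locally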
Click here to launch the app (ctrl + click)