COMMunication
RESearch.NET

This is an old revision of the document!
library(bitops)
library(RCurl)
library(rjson)
library(twitteR)
library(digest)
library(ROAuth)
library(KoNLP)
library(rJava)
library(tm)
library(wordcloud)
library(XLConnect)

# Load the movie plot summaries that feed the rest of this script.
#
# NOTE(review): this working directory is machine-specific. The relative
# input path below (and any relative output paths used later) resolve
# against it, so adjust it before running on another machine.
setwd("D:/Users/Hyo/Clouds/CS-DS/CS/MovieStudy")

# No `rm(list = ls())` here: every object this script uses is assigned by the
# script itself, so wiping the caller's workspace only destroys unrelated work.

# Workbook holding the movie data, relative to the working directory.
ani <- "all_comb.xlsx"
if (!file.exists(ani)) {
  stop("Input workbook not found: ", normalizePath(ani, mustWork = FALSE),
       call. = FALSE)
}

# Read the "imdb" worksheet into a data frame.
anisheet <- readWorksheetFromFile(ani, sheet = "imdb")
if (is.null(anisheet$plotStory)) {
  # Without this check a missing column silently produces an empty corpus.
  stop("Sheet 'imdb' has no 'plotStory' column", call. = FALSE)
}

# One corpus document per entry of the plotStory column.
ani.text <- Corpus(VectorSource(anisheet$plotStory))
result.text <- ani.text


# Build the working corpus from result.text and normalise its text:
# lower-case, drop stopwords, then strip punctuation and digits.

# result.text may already be a tm corpus. Wrapping a corpus in VectorSource()
# again is redundant at best (NOTE(review): on recent tm releases it appears
# to mangle the documents -- confirm), so only raw text gets wrapped.
myCorpus <- if (inherits(result.text, "Corpus")) {
  result.text
} else {
  Corpus(VectorSource(result.text))
}

# tolower() is a plain character function; content_transformer() adapts it so
# tm_map() keeps returning proper tm text documents.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

# Remove stopwords while apostrophes are still present: contraction entries
# in the English stopword list (e.g. "don't") cannot match once punctuation
# has been stripped. "rt" is dropped as an extra custom stopword.
myStopwords <- c(stopwords("english"), "rt")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)

# Preview up to the first five documents; bounded so that a corpus with fewer
# than five documents does not raise a subscript error.
inspect(myCorpus[seq_len(min(5L, length(myCorpus)))])

# Term-document matrix over the cleaned corpus. wordLengths = c(2, Inf) is
# passed as tm's word-length bounds (minimum 2, no upper limit).
myTdm <- TermDocumentMatrix(
  myCorpus,
  control = list(wordLengths = c(2, Inf))
)

# Dense data-frame copy of the matrix, written as a tab-separated file with
# the terms as row names and no column header line.
mat <- as.data.frame(as.matrix(myTdm))
write.table(
  mat,
  file = "myTdm-ani.txt",
  sep = "\t",
  row.names = TRUE,
  col.names = FALSE
)

# Draw a word cloud of the terms in myTdm, sized by total frequency.

# 12-colour "Paired" palette (brewer.pal is RColorBrewer's; presumably
# attached as a dependency of wordcloud -- confirm).
pal <- brewer.pal(12, "Paired")

# Font setup. Note that the font name is sensitive to spacing and letter case.
# Malgun Gothic: windowsFonts(malgun = windowsFont("맑은 고딕"))
# Nanum Gothic:  windowsFonts(malgun = windowsFont("나눔고딕"))
# windowsFonts() only exists on Windows builds of R, so register the custom
# font there and fall back to the default sans family on other platforms.
if (.Platform$OS.type == "windows") {
  windowsFonts(malgun = windowsFont("서울남산체 B"))
  cloudFamily <- "malgun"
} else {
  cloudFamily <- "sans"
}

m <- as.matrix(myTdm)
# Total frequency of each term across all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)
myNames <- names(v)
# Relabelling hook: replaces the displayed label of one term. As written it
# maps "apple" to "apple", i.e. a no-op -- presumably a template to edit.
k <- which(names(v) == "apple")
myNames[k] <- "apple"
# stringsAsFactors = FALSE keeps the labels as character on R < 4.0 as well.
d <- data.frame(word = myNames, freq = v, stringsAsFactors = FALSE)
# Alternative settings:
# wordcloud(d$word, d$freq, scale = c(4, 0.5), min.freq = 3,
#           random.order = FALSE, rot.per = 0.1, family = cloudFamily)
wordcloud(d$word, d$freq, scale = c(4, 0.7), min.freq = 2,
          random.order = FALSE, rot.per = 0.1, colors = pal,
          family = cloudFamily)