text_mining_example_with_korean_songs
Lyrics in Music
# 90s Korean song-lyrics text mining: load lyrics from an Excel sheet,
# extract nouns with KoNLP, build term-document matrices, and set up the
# plotting device for the word clouds drawn below.
library(bitops)
library(RCurl)
library(KoNLP)
library(rJava)
library(tm)
library(wordcloud)
library(XLConnect)
library(twitteR)

# Set your data dir in which the save file is located.
setwd("D:/Users/Hyo/Clouds/Cs-Ds/CS/MusicStudy")
rm(list = ls())

music <- file.path("mm.xlsx")
music90s <- readWorksheetFromFile(music, sheet = "1990s")

# Use VectorSource: one corpus document per row of the lyrics column.
lyrics <- Corpus(VectorSource(music90s$lyrics))
result.text <- lyrics
inspect(result.text[1:5])

# removeTwitSign <- function(x) { gsub("@[[:graph:]]*","",x) } # may not be used in this case
# removeURL <- function(x) { gsub("http://[[:graph:]]*","",x) } # may not be used in this case

# Strip embedded newlines from a document.
# (FIX: this and exNouns were commented out in the original even though
# both are called via sapply() below, so the script could not run.)
removeEnter <- function(x) { gsub("\n", "", x) }
# Collapse the nouns KoNLP extracts into one space-separated string.
exNouns <- function(x) { paste(extractNoun(x), collapse = " ") }

# Convert NA entries to "".
result.text[is.na(result.text)] <- ""

useSejongDic()
# Register song-specific Korean nouns with POS tag "ncn" so the noun
# extractor does not split them apart.
mergeUserDic(data.frame(c(
  "개여울", "고운정", "못잊어", "내마음", "가시내",
  "꽃가마", "내곁", "윙크", "못잊어서"
), c("ncn")))

# Remove newlines first, then extract nouns from the cleaned text.
# (FIX: the original applied exNouns to result.text again, silently
# discarding the removeEnter pass.)
result_nouns <- sapply(result.text, removeEnter)
result_nouns <- sapply(result_nouns, exNouns)

myCorpus <- Corpus(VectorSource(result_nouns))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, tolower)
myStopwords <- c(stopwords("english"), "rt")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
inspect(myCorpus[1:5])

# Term-document matrix keeping even single-character terms.
myTdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
mat <- as.data.frame(as.matrix(myTdm))
write.table(mat, file = "_lyrics_90s1.txt",
            col.names = FALSE, row.names = TRUE, sep = "\t")

# Second matrix restricted to terms of length >= 2.
myTdm2 <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(2, Inf)))
mat2 <- as.data.frame(as.matrix(myTdm2))
write.table(mat2, file = "_lyrics_90s2.txt",
            col.names = FALSE, row.names = TRUE, sep = "\t")

# Two word clouds side by side.
par(mfrow = c(1, 2))
pal <- brewer.pal(8, "Dark2")
# Font setup.
# Note: Windows font names are sensitive to spacing and case.
# Malgun Gothic: windowsFonts(malgun = windowsFont("맑은 고딕"))
# Nanum Gothic:  windowsFonts(malgun = windowsFont("나눔고딕"))
windowsFonts(malgun = windowsFont("맑은 고딕"))

# Word cloud over all terms (myTdm: word lengths >= 1).
freq_all <- sort(rowSums(as.matrix(myTdm)), decreasing = TRUE)
d <- data.frame(word = names(freq_all), freq = freq_all)
wordcloud(d$word, d$freq,
          scale = c(7, 0.8), min.freq = 3,
          random.order = FALSE, rot.per = .1,
          colors = pal, family = "malgun")

# Word cloud restricted to terms of length >= 2 (myTdm2).
freq_long <- sort(rowSums(as.matrix(myTdm2)), decreasing = TRUE)
d2 <- data.frame(word = names(freq_long), freq = freq_long)
wordcloud(d2$word, d2$freq,
          scale = c(7, 0.8), min.freq = 3,
          random.order = FALSE, rot.per = .1,
          colors = pal, family = "malgun")
hkimscil # I don't tweet much . . . .
jaemyung_lee # He does a lot!
see https://dev.twitter.com/apps/new
# Get your own credentials and replace the "xxxx" placeholders below.
api_key <- "xxxx"
api_secret <- "xxxx"
access_token <- "xxxx"
access_secret <- "xxxx"

# Authenticate this session against the Twitter API.
setup_twitter_oauth(api_key, api_secret, access_token, access_secret)
Go to https://apps.twitter.com/
# Twitter text-mining example over the @rdatamining user timeline.
#
# SECURITY FIX: the original published a live API key, API secret, access
# token, and access secret in plain text. They are redacted here — never
# commit real credentials; create your own at https://apps.twitter.com/
# and fill in the placeholders.
api_key <- "xxxx"
api_secret <- "xxxx"
access_token <- "xxxx"
access_secret <- "xxxx"
setup_twitter_oauth(api_key, api_secret, access_token, access_secret)

library(twitteR)
# Retrieve the first 100 tweets (or all tweets if fewer than 100)
# from the user timeline of @rdatamining.
rdmTweets <- userTimeline("rdatamining", n = 100)
n <- length(rdmTweets)
rdmTweets[1:3]
df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
dim(df)

library(tm)
# Build a corpus, which is a collection of text documents.
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
myCorpus <- tm_map(myCorpus, tolower)
# Remove punctuation.
myCorpus <- tm_map(myCorpus, removePunctuation)
# Remove numbers.
myCorpus <- tm_map(myCorpus, removeNumbers)
# Remove stopwords, but keep "r" by dropping it from the stopword list.
# (FIX: the original repeated this five-statement block twice verbatim;
# one pass is sufficient.)
myStopwords <- c(stopwords("english"), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# Keep the pre-stemming corpus as the stem-completion dictionary.
dictCorpus <- myCorpus
# Stem words with the snowball stemmers
# (requires packages Snowball, RWeka, rJava, RWekajars).
myCorpus <- tm_map(myCorpus, stemDocument)
# Inspect the first three documents.
inspect(myCorpus[1:3])
# Stem completion against the unstemmed dictionary corpus.
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary = dictCorpus)
inspect(myCorpus[1:3])
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
text_mining_example_with_korean_songs.txt · Last modified: 2017/12/14 10:12 by hkimscil