User Tools

Site Tools


text_mining_example_with_korean_songs

Lyrics in Music

mm.xlsx

library(bitops)
library(RCurl)
library(KoNLP)
library(rJava)
library(tm)
library(wordcloud)
library(XLConnect)
library(twitteR)

# Working directory that contains the data file (mm.xlsx).
# NOTE(review): a hard-coded absolute path makes the script non-portable;
# prefer a relative path or a project-root helper.
setwd("D:/Users/Hyo/Clouds/Cs-Ds/CS/MusicStudy")

# NOTE(review): rm(list = ls()) was removed here — wiping the user's entire
# workspace from inside a script is an anti-pattern; start a fresh R session
# instead if a clean environment is needed.

music <- "mm.xlsx"  # file.path() with a single component was a no-op

# Read the "1990s" sheet; song lyrics are in the `lyrics` column.
music90s <- readWorksheetFromFile(music, sheet="1990s")

# Build a tm corpus: VectorSource treats each element of the character
# vector (one song's lyrics) as one document.
lyrics <- Corpus(VectorSource(music90s$lyrics))
result.text <- lyrics  # alias kept — later steps refer to result.text

# Sanity check: look at the first five documents.
inspect(result.text[1:5])

# Text-cleaning helpers.  The Twitter-specific ones are kept for reference
# but are not needed for lyric data.
# removeTwitSign <- function(x) { gsub("@[[:graph:]]*","",x) }
# removeURL <- function(x) { gsub("http://[[:graph:]]*","",x) }

# Strip newline characters from a document.
# BUG FIX: this and exNouns() were commented out in the previous revision
# but are called below, which raised "object not found".
removeEnter <- function(x) { gsub("\n","",x) }
# Extract nouns with KoNLP and re-join them into one space-separated string.
exNouns <- function(x) { paste(extractNoun(x), collapse=" ")}

# Replace NA documents with the empty string.
# NOTE(review): is.na() on a tm Corpus is unlikely to flag any element;
# this looks like a leftover from a character-vector version of the
# pipeline — confirm it is still needed.
result.text[is.na(result.text)]   <- ""

# Use the Sejong dictionary and register lyric-specific words so that
# extractNoun() recognizes them (tag "ncn" = common noun).
useSejongDic()
mergeUserDic(data.frame(c(
"개여울", "고운정", "못잊어", "내마음", "가시내", "꽃가마", "내곁", "윙크",
"못잊어서"
), c("ncn")))

# BUG FIX: the second sapply() previously re-ran on result.text, silently
# discarding the removeEnter() pass; the two steps are now chained.
result_nouns <- sapply(result.text, removeEnter)
result_nouns <- sapply(result_nouns, exNouns)

# Rebuild a corpus from the extracted nouns and normalize it.
myCorpus <- Corpus(VectorSource(result_nouns))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
# BUG FIX: base tolower is not a tm transformation; on tm >= 0.6 it must be
# wrapped in content_transformer() or the corpus structure is corrupted.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myStopwords <- c(stopwords('english'), "rt")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

inspect(myCorpus[1:5])

# Build a term-document matrix with a minimum term length and dump it as
# a tab-separated file; returns the TDM for the plotting section below.
write_tdm <- function(corpus, min_len, path) {
  tdm <- TermDocumentMatrix(corpus, control=list(wordLengths=c(min_len, Inf)))
  mat <- as.data.frame(as.matrix(tdm))
  write.table(mat, file=path, col.names=FALSE, row.names=TRUE, sep="\t")
  tdm
}

myTdm  <- write_tdm(myCorpus, 1, "_lyrics_90s1.txt")  # terms of length >= 1
myTdm2 <- write_tdm(myCorpus, 2, "_lyrics_90s2.txt")  # terms of length >= 2

# Two word clouds side by side: all terms (myTdm) vs length >= 2 (myTdm2).
par(mfrow = c(1,2))

pal <- brewer.pal(8,"Dark2")

# Korean font setup.  Windows-only; the font name is sensitive to spacing
# and casing.
# Malgun Gothic : windowsFonts(malgun=windowsFont("맑은 고딕"))
# Nanum Gothic  : windowsFonts(malgun=windowsFont("나눔고딕"))
windowsFonts(malgun=windowsFont("맑은 고딕"))

# Draw a frequency-sorted word cloud for a term-document matrix.
# (Previously this stanza was copy-pasted twice; factored into a helper.)
draw_cloud <- function(tdm) {
  freqs <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  d <- data.frame(word=names(freqs), freq=freqs)
  wordcloud(d$word, d$freq, scale=c(7,0.8), min.freq=3, random.order=FALSE,
            rot.per=.1, colors=pal, family="malgun")
}

draw_cloud(myTdm)
draw_cloud(myTdm2)

90s.jpg

Twitter

hkimscil # I don't tweet much . . . .
jaemyung_lee # He does a lot!

see https://dev.twitter.com/apps/new

#get your own one 
# Template: fill in your own Twitter application credentials
# (create an app to obtain them, then authenticate the session).
credentials <- list(
  api_key       = "xxxx",
  api_secret    = "xxxx",
  access_token  = "xxxx",
  access_secret = "xxxx"
)
setup_twitter_oauth(credentials$api_key, credentials$api_secret,
                    credentials$access_token, credentials$access_secret)

Goto https://apps.twitter.com/

# SECURITY FIX: the previous revision hard-coded real OAuth credentials in
# source.  Committed secrets must be treated as compromised — revoke and
# regenerate them in the Twitter developer console.  Supply replacements via
# environment variables (e.g. in ~/.Renviron), never in code.
api_key       <- Sys.getenv("TWITTER_API_KEY")
api_secret    <- Sys.getenv("TWITTER_API_SECRET")
access_token  <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
setup_twitter_oauth(api_key, api_secret, access_token, access_secret)

library(twitteR)

# Pull up to 100 tweets from the @rdatamining user timeline
# (fewer are returned if the account has fewer tweets).
rdmTweets <- userTimeline("rdatamining", n=100)
n <- length(rdmTweets)
# Peek at the first three status objects.
rdmTweets[1:3]

# Flatten the list of status objects into one data frame, one row per tweet.
tweet_frames <- lapply(rdmTweets, as.data.frame)
df <- do.call("rbind", tweet_frames)
dim(df)

library(tm)

# Build a corpus from the tweet text: VectorSource treats each element of
# the character vector (one tweet) as one document.
myCorpus <- Corpus(VectorSource(df$text))

# Normalize: lower-case, strip punctuation and digits.
# BUG FIX: base tolower must be wrapped in content_transformer() on
# tm >= 0.6, otherwise the corpus structure is corrupted.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)

# Stopword removal: drop boilerplate terms "available"/"via", but keep "r"
# (the language name) even if it appears in the stopword list.
myStopwords <- c(stopwords('english'), "available", "via")
# BUG FIX: the old code used myStopwords[-which(myStopwords == "r")].
# "r" is not in tm's English stopword list, so which() returned integer(0)
# and x[-integer(0)] silently produced an EMPTY vector — meaning no
# stopwords were removed at all.  setdiff() handles the absent case safely.
myStopwords <- setdiff(myStopwords, "r")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
# (An exact duplicate of this stopword block was removed; removeWords is
# idempotent, so behavior is unchanged.)


# Keep an unstemmed copy of the corpus to drive stemCompletion() below.
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three ``documents"
inspect(myCorpus[1:3])

# stem completion: re-expand each stem to a full word found in dictCorpus.
# NOTE(review): passing stemCompletion straight to tm_map is known to break
# on tm >= 0.6 (documents are no longer plain character vectors); confirm
# the installed tm version, or wrap the call in content_transformer() with
# an explicit per-word sapply.
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
inspect(myCorpus[1:3])

# Build the term-document matrix.
# NOTE(review): minWordLength is the pre-0.6 control name; newer tm expects
# wordLengths = c(1, Inf) — the lyrics section earlier in this file already
# uses the newer form, so this line is inconsistent with it.
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
text_mining_example_with_korean_songs.txt · Last modified: 2017/12/14 10:12 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki