User Tools

Site Tools


text_mining_example_with_korean_songs

This is an old revision of the document!


library(bitops)
library(RCurl)
library(KoNLP)
library(rJava)
library(tm)
library(wordcloud)
library(XLConnect)



# NOTE(review): the absolute working directory and the workspace wipe are kept
# so the script behaves as before; both are machine-specific / destructive and
# are good candidates for removal once the script is run from a project root.
setwd("D:/Users/Hyo/Clouds/Cs-Ds/CS/MusicStudy")

rm(list = ls())

# Workbook holding the lyrics, one worksheet per decade.
music <- file.path("mm.xlsx")

# Read the 1990s worksheet; the `lyrics` column holds one song text per row.
music90s <- readWorksheetFromFile(music, sheet = "1990s")

# Corpus view of the raw lyrics (not used by the cleaning steps below).
lyrics <- Corpus(VectorSource(music90s$lyrics))

# The cleaning steps below rely on sub()/gsub(), which work on character
# vectors. Handing them the Corpus object forces an as.character() coercion of
# the corpus' internal list structure, which does not reliably yield one string
# per song. Start from the plain character column instead.
result.text <- as.character(music90s$lyrics)

# Remove URLs. Each pattern ends in `.*$`, so the match also consumes the text
# that follows the URL, and sub() only acts on the first match per element.
# NOTE(review): confirm that dropping the text after a URL is intended.
for (url_pattern in c("https://(?:www[.])?([^/]*).*$",
                      "http://(?:www[.])?([^/]*).*$")) {
  result.text <- sub(url_pattern, "", result.text)
}

# Patterns whose matches are replaced by a single space, applied in order.
space_patterns <- c(
  "\n", "\r",          # line breaks
  # NOTE(review): "co" also matches inside ordinary words (e.g. "come");
  # confirm this is wanted for lyrics.
  "RT", "CO", "co",
  # Any run of two or more "ㅋ". Replacing "ㅋㅋ" first and the longer runs
  # afterwards meant the longer patterns could never match and a stray "ㅋ"
  # was left behind for odd-length runs.
  "ㅋㅋ+",
  "ㅠㅠ",
  "\\^", "\\ː", "/", "#", "□", "↔"
)
for (pattern in space_patterns) {
  result.text <- gsub(pattern, " ", result.text)
}

# Patterns that are deleted outright (no space left behind).
for (pattern in c(" ̄ ̄", "◆", "→", "↑")) {
  result.text <- gsub(pattern, "", result.text)
}

# Remaining punctuation that becomes a space. The second "/" pass of the
# original was redundant: every "/" has already been replaced above.
for (pattern in c("\\?", "\\&", "\\\\")) {
  result.text <- gsub(pattern, " ", result.text)
}

# Decorative symbols, bullets and circled numbers. Every occurrence of each
# symbol is replaced by a single space, in the order listed.
decorative_symbols <- c(
  "▽", "▷", "♤", "●", "▶", "♡", "▲", "■", "▼", "★", "☆", "♥", "♣",
  "♪", "≠", "∴", "◀", "☜", "♧", "♨", "♬", "①", "②", "③", "⊙", "☞"
)
for (symbol in decorative_symbols) {
  result.text <- gsub(symbol, " ", result.text)
}

# Make sure a space follows every comma and every period.
result.text <- gsub(",", ", ", result.text)
result.text <- gsub("\\.", "\\. ", result.text)

# Operators, colons, brackets and similar separators become a space.
for (separator in c("\\+", "\\-", "\\:", "\\(", "\\)")) {
  result.text <- gsub(separator, " ", result.text)
}

# Drop a space that sits directly before a line break.
result.text <- gsub(" \n", "\n", result.text)

for (separator in c("=", "~")) {
  result.text <- gsub(separator, " ", result.text)
}

# The "^_^" emoticon. The carets have to be escaped: unescaped, "^" is an
# anchor, so the original pattern "^_^" could never match anything.
result.text <- gsub("\\^_\\^", " ", result.text)

# Trim every leading and trailing space. The original "^ " / " $" pair removed
# at most one space from each end, leaving padding behind after the many
# symbol-to-space replacements.
result.text <- gsub("^ +| +$", "", result.text)

# Strip Twitter-style mentions: an "@" together with the run of printable,
# non-space characters that follows it (the run may be empty).
#   x: character vector
# Returns a character vector of the same length with the mentions removed.
removeTwitSign <- function(x) {
  mention_pattern <- "@[[:graph:]]*"
  gsub(pattern = mention_pattern, replacement = "", x = x)
}
# Strip URLs: the scheme plus the run of printable, non-space characters that
# follows it. Both "http://" and "https://" are handled; the original pattern
# only covered "http://", so secure links were left in the text.
#   x: character vector
# Returns a character vector of the same length with the URLs removed.
removeURL <- function(x) {
  gsub("https?://[[:graph:]]*", "", x)
}
# Delete every newline character from the input (nothing is put in its place).
#   x: character vector
# Returns a character vector of the same length.
removeEnter <- function(x) {
  gsub(pattern = "\n", replacement = "", x = x)
}
# Run extractNoun() on `x` and join whatever it returns into one
# space-separated string.
#   x: text passed straight to extractNoun()
# Returns a single character string.
exNouns <- function(x) {
  nouns <- extractNoun(x)
  paste(nouns, collapse = " ")
}

# Convert NA entries to empty strings.
is_missing <- is.na(result.text)
result.text[is_missing] <- ""

# Delete every remaining punctuation character, then one trailing space.
result.text <- gsub(pattern = "[[:punct:]]", replacement = "", x = result.text)
result.text <- sub(pattern = " $", replacement = "", x = result.text)

# Activate the Sejong dictionary in KoNLP before any noun extraction runs.
useSejongDic()
# Register additional terms in the user dictionary. The data frame has two
# unnamed columns: the terms themselves and the tag "ncn", which data.frame()
# recycles across all rows.
# NOTE(review): presumably "ncn" marks these entries as nouns so that
# extractNoun() keeps them as single tokens - confirm against the KoNLP docs.
mergeUserDic(data.frame(c(
"개여울", "고운정", "못잊어", "내마음", "가시내", "꽃가마", "내곁", "윙크",
"못잊어서"
), c("ncn")))

# Extract the nouns of every lyric as one space-separated string.
# vapply() pins the result to a character vector (one string per lyric), so an
# empty input yields character(0) instead of sapply()'s empty list.
result_nouns <- vapply(result.text, exNouns, character(1))

# Build the corpus from the noun strings and normalise it.
myCorpus <- Corpus(VectorSource(result_nouns))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
# tolower() is a plain character function, not a tm transformation; wrapping it
# in content_transformer() keeps the corpus documents intact so that
# TermDocumentMatrix() can still consume them.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

# English stop words plus the retweet marker "rt".
myStopwords <- c(stopwords("english"), "rt")
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# Show up to the first five documents; a fixed 1:5 index fails on corpora with
# fewer than five documents.
inspect(myCorpus[seq_len(min(5, length(myCorpus)))])

# Term-document matrix over the cleaned corpus; wordLengths = c(1, Inf) keeps
# terms of any length, including single-character ones.
tdm_control <- list(wordLengths = c(1, Inf))
myTdm <- TermDocumentMatrix(myCorpus, control = tdm_control)

# Dump the matrix as tab-separated text: one row per term, row names (the
# terms) included, no header row.
mat <- as.data.frame(as.matrix(myTdm))
write.table(
  mat,
  file = "_lyrics_90s.txt",
  sep = "\t",
  row.names = TRUE,
  col.names = FALSE
)

# Eight colours from the "Dark2" palette, used to colour the word cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Font setup. Note that the font name is sensitive to spacing and letter case.
# Malgun Gothic : windowsFonts(malgun=windowsFont("맑은 고딕"))
# Nanum Gothic  : windowsFonts(malgun=windowsFont("나눔고딕"))
malgun_font <- windowsFont("맑은 고딕")
windowsFonts(malgun = malgun_font)
 
# Term frequencies across all documents, most frequent first.
m <- as.matrix(myTdm)
v <- sort(rowSums(m), decreasing = TRUE)

# Display labels for the terms. The original renamed the term "apple" to
# "apple", which changed nothing; that no-op and its index variable were
# dropped. To relabel a term, assign into myNames here, e.g.
#   myNames[myNames == "old"] <- "new"
myNames <- names(v)
d <- data.frame(word = myNames, freq = v)

# Draw the cloud: terms occurring at least twice, most frequent placed first,
# 10% of the words rotated, rendered in the "malgun" font family.
# Alternative with smaller scale and no palette:
# wordcloud(d$word, d$freq, scale = c(4, 0.5), min.freq = 3,
#           random.order = FALSE, rot.per = .1, family = "malgun")
wordcloud(
  d$word, d$freq,
  scale = c(7, 0.8),
  min.freq = 2,
  random.order = FALSE,  # spelled out: F is an ordinary, reassignable variable
  rot.per = .1,
  colors = pal,
  family = "malgun"
)
text_mining_example_with_korean_songs.1480900722.txt.gz · Last modified: 2016/12/05 09:48 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki