====== Lyrics in Music ======
{{:mm.xlsx}}

<code>library(bitops)
library(RCurl)
library(tm)     # needed below for Corpus() and TermDocumentMatrix()
library(KoNLP)  # needed below for useSejongDic() and extractNoun()
library(wordcloud)
library(XLConnect)
library(twitteR)
  
# set the working directory to the folder that holds the data file
setwd ("D:/Users/Hyo/Clouds/Cs-Ds/CS/MusicStudy")
  
# read the attached workbook (mm.xlsx, linked above, is assumed to be the file)
music <- "mm.xlsx"
  
music90s <- readWorksheetFromFile(music, sheet="1990s")
# use VectorSource: each row of music90s$lyrics becomes one document
lyrics <- Corpus(VectorSource(music90s$lyrics))
result.text <- lyrics
  
-result.text <- sub("https://(?:www[.])?([^/]*).*$", "", result.text) +inspect(result.text[1:5])
-result.text <- sub("http://(?:www[.])?([^/]*).*$", "", result.text) +
-result.text <- gsub("\n", " ", result.text) +
-result.text <- gsub("\r", " ", result.text) +
-result.text <- gsub("RT", " ", result.text) +
-result.text <- gsub("CO", " ", result.text) +
-result.text <- gsub("co", " ", result.text) +
-result.text <- gsub("ㅋㅋ", " ", result.text) +
-result.text <- gsub("ㅋㅋㅋ", " ", result.text) +
-result.text <- gsub("ㅋㅋㅋㅋ", " ", result.text) +
-result.text <- gsub("ㅠㅠ", " ", result.text) +
-result.text <- gsub("\\^", " ", result.text) +
-result.text <- gsub("\\ː", " ", result.text) +
-result.text <- gsub("/", " ", result.text) +
-result.text <- gsub("#", " ", result.text) +
-result.text <- gsub("□", " ", result.text) +
-result.text <- gsub("↔", " ", result.text) +
-result.text <- gsub(" ̄ ̄", "", result.text) +
-result.text <- gsub("◆", "", result.text) +
-result.text <- gsub("→", "", result.text) +
-result.text <- gsub("↑", "", result.text) +
-result.text <- gsub("\\?", " ", result.text) +
-result.text <- gsub("\\&", " ", result.text) +
-result.text <- gsub("/", " ", result.text) +
-result.text <- gsub("\\\\", " ", result.text)+
  
-result.text <- gsub("▽", " ", result.text) +removeTwitSign <- function(x) { gsub("@[[:graph:]]*","",x) } # may not be used in this case 
-result.text <- gsub("▷", " ", result.text) +removeURL <- function(x) { gsub("http://[[:graph:]]*","",x) } # may not be used in this case 
-result.text <- gsub("♤", " ", result.text) +removeEnter <- function(x) { gsub("\n","",x) }  
-result.text <- gsub("●", " ", result.text) +exNouns <- function(x) { paste(extractNoun(x), collapse=" ")}
-result.text <- gsub("▶", " ", result.text) +
-result.text <- gsub("♡", " ", result.text) +
-result.text <- gsub("▲", " ", result.text) +
-result.text <- gsub("■", " ", result.text) +
-result.text <- gsub("▼", " ", result.text) +
-result.text <- gsub("★", " ", result.text) +
-result.text <- gsub("☆", " ", result.text) +
-result.text <- gsub("♥", " ", result.text) +
-result.text <- gsub("♣", " ", result.text) +
-result.text <- gsub("♪", " ", result.text) +
-result.text <- gsub("≠", " ", result.text) +
-result.text <- gsub("∴", " ", result.text) +
-result.text <- gsub("◀", " ", result.text) +
-result.text <- gsub("☜", " ", result.text) +
-result.text <- gsub("♧", " ", result.text) +
-result.text <- gsub("♨", " ", result.text) +
-result.text <- gsub("♬", " ", result.text) +
-result.text <- gsub("①", " ", result.text) +
-result.text <- gsub("②", " ", result.text) +
-result.text <- gsub("③", " ", result.text) +
-result.text <- gsub("⊙", " ", result.text) +
-result.text <- gsub("☞", " ", result.text)  +
- +
-result.text <- gsub(",", ", ", result.text) +
-result.text <- gsub("\\.", "\\. ", result.text) +
-result.text <- gsub("\\+", " ", result.text) +
-result.text <- gsub("\\-", " ", result.text) +
-result.text <- gsub("\\:", " ", result.text) +
- +
-result.text <- gsub("\\(", " ", result.text) +
-result.text <- gsub("\\)", " ", result.text) +
- +
-result.text <- gsub(" \n", "\n", result.text) +
-result.text <- gsub("=", " ", result.text) +
-result.text <- gsub("~", " ", result.text) +
-result.text <- gsub("^_^", " ", result.text) +
-result.text <- gsub("^ ", "", result.text) +
-result.text <- gsub(" $", "", result.text) +
- +
-removeTwitSign <- function(x) { gsub("@[[:graph:]]*","",x) } +
-removeURL <- function(x) { gsub("http://[[:graph:]]*","",x) } +
-removeEnter <- function(x) { gsub("\n","",x) } +
-exNouns <- function(x) { paste(extractNoun(x), collapse=" ")}+
  
# convert NA to ""
result.text[is.na(result.text)]   <- ""
-result.text <- gsub("[[:punct:]]", "", result.text) 
-result.text <- gsub(" $", "", result.text) 
  
useSejongDic()
 ), c("ncn"))) ), c("ncn")))
  
result_nouns <- sapply(result.text, removeEnter)
result_nouns <- sapply(result_nouns, exNouns)  # extract nouns from the cleaned text
  
</code>

{{90s.jpg}}
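The steps that turn result_nouns into the word cloud above are not shown in this revision. A minimal sketch of that stage, assuming result_nouns holds the space-separated nouns extracted above (the variable names and the wordcloud options are illustrative, not the original code):

<code># a sketch, not the original code: build a term-document matrix from the
# extracted nouns and draw a word cloud like the one shown above
noun.corpus <- Corpus(VectorSource(result_nouns))
tdm <- TermDocumentMatrix(noun.corpus, control = list(wordLengths = c(2, Inf)))
word.freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordcloud(words = names(word.freq), freq = word.freq,
          min.freq = 2, random.order = FALSE)
</code>
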
====== Twitter ======
hkimscil # I don't tweet much . . . .
jaemyung_lee # He does a lot! (a sketch that pulls both timelines follows the setup block below)

See https://dev.twitter.com/apps/new to register an app and obtain the four keys used below.

<code># get your own keys
api_key <- "xxxx"
api_secret <- "xxxx"
access_token <- "xxxx"
access_secret <- "xxxx"
setup_twitter_oauth(api_key, api_secret, access_token, access_secret)
</code>
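Once setup_twitter_oauth() succeeds, a user's timeline can be pulled with userTimeline() and converted to a data frame with twListToDF(). A minimal sketch for the two accounts mentioned above (the value of n and the variable names are illustrative):

<code># a sketch, assuming the OAuth setup above succeeded
hk.tweets  <- userTimeline("hkimscil", n = 200)
lee.tweets <- userTimeline("jaemyung_lee", n = 200)
# turn the lists of status objects into data frames
hk.df  <- twListToDF(hk.tweets)
lee.df <- twListToDF(lee.tweets)
head(hk.df$text)
</code>
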
Go to https://apps.twitter.com/

<code>
api_key <- "TglWL7ysGLdwIP7g8CzTw"
api_secret <- "7oWf4jfYBOV57GX2sFeBCIFv23sJNkm72lQ83GTnnAs"
access_token <- "24853107-PnJgDNnZgoGR22ffvAiJFq2anqx84prSPlsRSV3te"
access_secret <- "rkptQl92SusirGmGRX9Ch7WDhkzwU45LlhBJ2GSE"
setup_twitter_oauth(api_key, api_secret, access_token, access_secret)

library(twitteR)
# retrieve the first 100 tweets (or all tweets if fewer than 100)
# from the user timeline of @rdatamining
rdmTweets <- userTimeline("rdatamining", n=100)
n <- length(rdmTweets)
rdmTweets[1:3]

df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
dim(df)

library(tm)
# build a corpus, which is a collection of text documents
# VectorSource specifies that the source is character vectors
myCorpus <- Corpus(VectorSource(df$text))

# convert to lower case
myCorpus <- tm_map(myCorpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove stopwords: add "available" and "via" to the list, but keep "r"
myStopwords <- c(stopwords('english'), "available", "via")
myStopwords <- myStopwords[myStopwords != "r"]   # safe even if "r" is not in the list
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

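# keep an unstemmed copy of the corpus; it is used as the dictionary for stemCompletion() below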
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three "documents"
inspect(myCorpus[1:3])

# stem completion
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
inspect(myCorpus[1:3])

myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
</code>
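myDtm can now be explored. The following is a hedged follow-up sketch rather than part of the original walkthrough: it lists terms that occur at least ten times in total and draws a word cloud with the wordcloud package loaded at the top of the page (the frequency thresholds are arbitrary).

<code># a follow-up sketch, not in the original page
library(wordcloud)
# terms that occur at least ten times in total
findFreqTerms(myDtm, lowfreq = 10)

# overall word frequencies, plotted as a word cloud
m <- as.matrix(myDtm)
wordFreq <- sort(rowSums(m), decreasing = TRUE)
wordcloud(words = names(wordFreq), freq = wordFreq,
          min.freq = 3, random.order = FALSE)
</code>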