text_mining_example_with_korean_songs
Differences
This shows you the differences between two versions of the page.
| Next revision | Previous revision | ||
| text_mining_example_with_korean_songs [2016/12/05 09:48] – created hkimscil | text_mining_example_with_korean_songs [2017/12/14 10:12] (current) – [Twitter] hkimscil | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| + | ====== Lyrics in Music ====== | ||
| + | {{: | ||
| + | |||
| < | < | ||
| library(RCurl) | library(RCurl) | ||
| Line 6: | Line 9: | ||
| library(wordcloud) | library(wordcloud) | ||
| library(XLConnect) | library(XLConnect) | ||
| + | library(twitteR) | ||
| - | + | # set your data dir in which the save file is located. | |
| setwd (" | setwd (" | ||
| Line 15: | Line 18: | ||
| music90s <- readWorksheetFromFile(music, | music90s <- readWorksheetFromFile(music, | ||
| + | # use VectorSource | ||
| lyrics<- Corpus(VectorSource(music90s$lyrics)) | lyrics<- Corpus(VectorSource(music90s$lyrics)) | ||
| result.text <- lyrics | result.text <- lyrics | ||
| - | result.text | + | inspect(result.text[1:5]) |
| - | result.text <- sub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub("/", | + | |
| - | result.text <- gsub("#", | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub("/", | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | # removeTwitSign <- function(x) { gsub(" |
| - | result.text <- gsub(" | + | # removeURL <- function(x) { gsub(" |
| - | result.text <- gsub(" | + | # removeEnter <- function(x) { gsub(" |
| - | result.text <- gsub(" | + | # exNouns <- function(x) { paste(extractNoun(x), |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | + | ||
| - | result.text <- gsub(",", | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | + | ||
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | + | ||
| - | result.text <- gsub(" \n", " | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" | + | |
| - | result.text <- gsub(" $", "", | + | |
| - | + | ||
| - | removeTwitSign <- function(x) { gsub(" | + | |
| - | removeURL <- function(x) { gsub(" | + | |
| - | removeEnter <- function(x) { gsub(" | + | |
| - | exNouns <- function(x) { paste(extractNoun(x), | + | |
| # NA -> "" | # NA -> "" | ||
| result.text[is.na(result.text)] | result.text[is.na(result.text)] | ||
| - | result.text <- gsub(" | ||
| - | result.text <- gsub(" $", "", | ||
| useSejongDic() | useSejongDic() | ||
| Line 103: | Line 38: | ||
| ), c(" | ), c(" | ||
| + | result_nouns <- sapply(result.text, | ||
| result_nouns <- sapply(result.text, | result_nouns <- sapply(result.text, | ||
| Line 116: | Line 52: | ||
| myTdm <- TermDocumentMatrix(myCorpus, | myTdm <- TermDocumentMatrix(myCorpus, | ||
| mat <- as.data.frame(as.matrix(myTdm)) | mat <- as.data.frame(as.matrix(myTdm)) | ||
| - | write.table(mat, | + | write.table(mat, |
| + | |||
| + | myTdm2 <- TermDocumentMatrix(myCorpus, | ||
| + | mat2 <- as.data.frame(as.matrix(myTdm2)) | ||
| + | write.table(mat2, | ||
| + | |||
| + | par(mfrow = c(1,2)) | ||
| pal <- brewer.pal(8," | pal <- brewer.pal(8," | ||
| Line 126: | Line 68: | ||
| m <- as.matrix(myTdm) | m <- as.matrix(myTdm) | ||
| - | # calculate the frequency of words | ||
| v <- sort(rowSums(m), | v <- sort(rowSums(m), | ||
| myNames <- names(v) | myNames <- names(v) | ||
| - | k <- which(names(v)==" | ||
| - | myNames[k] <- " | ||
| d <- data.frame(word=myNames, | d <- data.frame(word=myNames, | ||
| - | #wordcloud(d$word, | + | wordcloud(d$word, |
| - | wordcloud(d$word, | + | |
| + | m2 <- as.matrix(myTdm2) | ||
| + | v2 <- sort(rowSums(m2), | ||
| + | myNames2 <- names(v2) | ||
| + | d2 <- data.frame(word=myNames2, | ||
| + | wordcloud(d2$word, | ||
| </ | </ | ||
| + | |||
| + | {{90s.jpg}} | ||
| + | |||
| + | ====== Twitter ====== | ||
| + | hkimscil # I don't tweet much . . . . | ||
| + | jaemyung_lee # He does a lot! | ||
| + | |||
| + | see https:// | ||
| + | |||
| + | |||
| + | |||
| + | < | ||
| + | api_key <- " | ||
| + | api_secret <- " | ||
| + | access_token <- " | ||
| + | access_secret <- " | ||
| + | setup_twitter_oauth(api_key, | ||
| + | </ | ||
| + | |||
| + | Goto https:// | ||
| + | |||
| + | < | ||
| + | api_key <- " | ||
| + | api_secret <- " | ||
| + | access_token <- " | ||
| + | access_secret <- " | ||
| + | setup_twitter_oauth(api_key, | ||
| + | |||
| + | library(twitteR) | ||
| + | # retrieve the first 100 tweets (or all tweets if fewer than 100) | ||
| + | # from the user timeline of @rdatammining | ||
| + | rdmTweets <- userTimeline(" | ||
| + | n <- length(rdmTweets) | ||
| + | rdmTweets[1: | ||
| + | |||
| + | df <- do.call(" | ||
| + | dim(df) | ||
| + | |||
| + | library(tm) | ||
| + | # build a corpus, which is a collection of text documents | ||
| + | # VectorSource specifies that the source is character vectors. | ||
| + | myCorpus <- Corpus(VectorSource(df$text)) | ||
| + | |||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # remove punctuation | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # remove numbers | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # remove stopwords | ||
| + | # keep " | ||
| + | myStopwords <- c(stopwords(' | ||
| + | idx <- which(myStopwords == " | ||
| + | myStopwords <- myStopwords[-idx] | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | |||
| + | # remove stopwords | ||
| + | # keep " | ||
| + | myStopwords <- c(stopwords(' | ||
| + | idx <- which(myStopwords == " | ||
| + | myStopwords <- myStopwords[-idx] | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | |||
| + | |||
| + | dictCorpus <- myCorpus | ||
| + | # stem words in a text document with the snowball stemmers, | ||
| + | # which requires packages Snowball, RWeka, rJava, RWekajars | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # inspect the first three ``documents" | ||
| + | inspect(myCorpus[1: | ||
| + | |||
| + | # stem completion | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | inspect(myCorpus[1: | ||
| + | |||
| + | myDtm <- TermDocumentMatrix(myCorpus, | ||
| + | </ | ||
| + | |||
| + | |||
text_mining_example_with_korean_songs.1480900722.txt.gz · Last modified: by hkimscil
