text_mining_example_with_korean_songs
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
text_mining_example_with_korean_songs [2016/12/07 09:51] – hkimscil | text_mining_example_with_korean_songs [2017/12/14 10:12] (current) – [Twitter] hkimscil | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Lylics in Music ====== | ||
{{: | {{: | ||
Line 8: | Line 9: | ||
library(wordcloud) | library(wordcloud) | ||
library(XLConnect) | library(XLConnect) | ||
+ | library(twitteR) | ||
- | + | # set your data dir in which the save file is located. | |
setwd (" | setwd (" | ||
Line 17: | Line 18: | ||
music90s <- readWorksheetFromFile(music, | music90s <- readWorksheetFromFile(music, | ||
+ | # use VectorSource | ||
lyrics<- Corpus(VectorSource(music90s$lyrics)) | lyrics<- Corpus(VectorSource(music90s$lyrics)) | ||
result.text <- lyrics | result.text <- lyrics | ||
- | removeTwitSign <- function(x) { gsub(" | + | inspect(result.text[1: |
- | removeURL <- function(x) { gsub(" | + | |
- | removeEnter <- function(x) { gsub(" | + | # removeTwitSign <- function(x) { gsub(" |
- | exNouns <- function(x) { paste(extractNoun(x), | + | # removeURL <- function(x) { gsub(" |
+ | # removeEnter <- function(x) { gsub(" | ||
+ | # exNouns <- function(x) { paste(extractNoun(x), | ||
# NA -> "" | # NA -> "" | ||
Line 78: | Line 82: | ||
{{90s.jpg}} | {{90s.jpg}} | ||
+ | |||
+ | ====== Twitter ====== | ||
+ | hkimscil # I don't tweet much . . . . | ||
+ | jaemyung_lee # He does a lot! | ||
+ | |||
+ | see https:// | ||
+ | |||
+ | |||
+ | |||
+ | < | ||
+ | api_key <- " | ||
+ | api_secret <- " | ||
+ | access_token <- " | ||
+ | access_secret <- " | ||
+ | setup_twitter_oauth(api_key, | ||
+ | </ | ||
+ | |||
+ | Goto https:// | ||
+ | |||
+ | < | ||
+ | api_key <- " | ||
+ | api_secret <- " | ||
+ | access_token <- " | ||
+ | access_secret <- " | ||
+ | setup_twitter_oauth(api_key, | ||
+ | |||
+ | library(twitteR) | ||
+ | # retrieve the first 100 tweets (or all tweets if fewer than 100) | ||
+ | # from the user timeline of @rdatammining | ||
+ | rdmTweets <- userTimeline(" | ||
+ | n <- length(rdmTweets) | ||
+ | rdmTweets[1: | ||
+ | |||
+ | df <- do.call(" | ||
+ | dim(df) | ||
+ | |||
+ | library(tm) | ||
+ | # build a corpus, which is a collection of text documents | ||
+ | # VectorSource specifies that the source is character vectors. | ||
+ | myCorpus <- Corpus(VectorSource(df$text)) | ||
+ | |||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | # remove punctuation | ||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | # remove numbers | ||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | # remove stopwords | ||
+ | # keep " | ||
+ | myStopwords <- c(stopwords(' | ||
+ | idx <- which(myStopwords == " | ||
+ | myStopwords <- myStopwords[-idx] | ||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | |||
+ | # remove stopwords | ||
+ | # keep " | ||
+ | myStopwords <- c(stopwords(' | ||
+ | idx <- which(myStopwords == " | ||
+ | myStopwords <- myStopwords[-idx] | ||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | |||
+ | |||
+ | dictCorpus <- myCorpus | ||
+ | # stem words in a text document with the snowball stemmers, | ||
+ | # which requires packages Snowball, RWeka, rJava, RWekajars | ||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | # inspect the first three "documents" | ||
+ | inspect(myCorpus[1: | ||
+ | |||
+ | # stem completion | ||
+ | myCorpus <- tm_map(myCorpus, | ||
+ | inspect(myCorpus[1: | ||
+ | |||
+ | myDtm <- TermDocumentMatrix(myCorpus, | ||
+ | </ | ||
+ | |||
+ |
text_mining_example_with_korean_songs.1481073669.txt.gz · Last modified: 2016/12/07 09:51 by hkimscil