text_mining_example_with_korean_songs
====== Lyrics in Music ======
{{:...}}

library(wordcloud)
library(XLConnect)
library(twitteR)

# set your data dir, in which the saved data file is located.
setwd("...")

music90s <- readWorksheetFromFile(music, ...)

# use VectorSource
lyrics <- Corpus(VectorSource(music90s$lyrics))
result.text <- lyrics
inspect(result.text[1:...])

# removeTwitSign <- function(x) { gsub( ... ) }
# removeURL      <- function(x) { gsub( ... ) }
# removeEnter    <- function(x) { gsub( ... ) }
# exNouns        <- function(x) { paste(extractNoun(x), ... ) }

# NA -> ""

{{90s.jpg}}
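For reference, a minimal end-to-end sketch of the lyric word cloud built above. The file name, sheet name, and column name are placeholders, and the original script's exact cleaning patterns are not visible in this revision, so treat this as an illustration rather than the page's exact code.

<code>
# sketch: Korean lyric word cloud (file/sheet/column names are placeholders)
library(XLConnect)
library(KoNLP)        # extractNoun() for Korean morphological analysis
library(tm)
library(wordcloud)
library(RColorBrewer)

useSejongDic()                                   # load the Sejong dictionary

music90s <- readWorksheetFromFile("90smusic.xlsx", sheet = "90s")
music90s$lyrics[is.na(music90s$lyrics)] <- ""    # NA -> ""

# keep only the nouns of each lyric
exNouns <- function(x) paste(extractNoun(as.character(x)), collapse = " ")
nouns   <- sapply(music90s$lyrics, exNouns)

lyrics <- Corpus(VectorSource(nouns))
lyrics <- tm_map(lyrics, content_transformer(tolower))
lyrics <- tm_map(lyrics, removePunctuation)
lyrics <- tm_map(lyrics, removeNumbers)

tdm  <- TermDocumentMatrix(lyrics)
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

wordcloud(names(freq), freq, min.freq = 2, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
</code>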
| + | |||
| + | ====== Twitter ====== | ||
| + | hkimscil # I don't twitt much . . . . | ||
| + | jaemyung_lee # He does a lot! | ||
| + | |||
| + | see https:// | ||
| + | |||
| + | |||
| + | |||
| + | < | ||
| + | api_key <- " | ||
| + | api_secret <- " | ||
| + | access_token <- " | ||
| + | access_secret <- " | ||
| + | setup_twitter_oauth(api_key, | ||
| + | </ | ||
| + | |||
| + | Goto https:// | ||
| + | |||
| + | < | ||
| + | api_key <- " | ||
| + | api_secret <- " | ||
| + | access_token <- " | ||
| + | access_secret <- " | ||
| + | setup_twitter_oauth(api_key, | ||
| + | |||
| + | library(twitteR) | ||
| + | # retrieve the first 100 tweets (or all tweets if fewer than 100) | ||
| + | # from the user timeline of @rdatammining | ||
| + | rdmTweets <- userTimeline(" | ||
| + | n <- length(rdmTweets) | ||
| + | rdmTweets[1: | ||
| + | |||
| + | df <- do.call(" | ||
| + | dim(df) | ||
| + | |||
| + | library(tm) | ||
| + | # build a corpus, which is a collection of text documents | ||
| + | # VectorSource specifies that the source is character vectors. | ||
| + | myCorpus <- Corpus(VectorSource(df$text)) | ||
| + | |||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # remove punctuation | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # remove numbers | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | # remove stopwords | ||
| + | # keep " | ||
| + | myStopwords <- c(stopwords(' | ||
| + | idx <- which(myStopwords == " | ||
| + | myStopwords <- myStopwords[-idx] | ||
| + | myCorpus <- tm_map(myCorpus, | ||
| + | |||
dictCorpus <- myCorpus
# stem words in a text document with the snowball stemmers,
# which requires packages Snowball, RWeka, rJava, RWekajars
myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first three "documents"
inspect(myCorpus[1:3])

# stem completion (in newer versions of tm, stemCompletion may need to be
# applied through content_transformer() or sapply() instead of tm_map())
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary = dictCorpus)
inspect(myCorpus[1:3])

myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
</code>
| + | |||
| + | |||