Differences

This shows you the differences between two versions of the page.

--- text_mining_example_with_korean_songs [2016/12/07 10:19] – hkimscil
+++ text_mining_example_with_korean_songs [2017/12/14 10:12] (current) – [Twitter] hkimscil
@@ Line 9: / Line 9: @@
 library(wordcloud)
 library(XLConnect)
+library(twitteR)
 # set the data directory in which the saved file is located.
@@ Line 85: / Line 86: @@
 hkimscil # I don't tweet much . . . .
 jaemyung_lee # He does a lot!
+see https://dev.twitter.com/apps/new
 <code>#get your own one
@@ Line 96: / Line 101: @@
 Go to https://apps.twitter.com/
+<code>
+# Twitter text-mining example: authenticate, download a user timeline,
+# clean the tweet text with tm, and build a term-document matrix.
+# Load twitteR before calling any of its functions.
+library(twitteR)
+# Credentials are read from environment variables instead of being written
+# into the page. Register an app at https://apps.twitter.com/ to get your
+# own keys and set these variables (e.g. in ~/.Renviron) before running.
+api_key <- Sys.getenv("TWITTER_API_KEY")
+api_secret <- Sys.getenv("TWITTER_API_SECRET")
+access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
+access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
+# Sys.getenv() returns "" for unset variables; stop here with a readable
+# message rather than failing later inside the OAuth handshake.
+if (!all(nzchar(c(api_key, api_secret, access_token, access_secret)))) {
+  stop("Set TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_ACCESS_TOKEN and ",
+       "TWITTER_ACCESS_SECRET before running this script.", call. = FALSE)
+}
+setup_twitter_oauth(api_key, api_secret, access_token, access_secret)
+# retrieve the first 100 tweets (or all tweets if fewer than 100)
+# from the user timeline of @rdatamining
+rdmTweets <- userTimeline("rdatamining", n = 100)
+n <- length(rdmTweets)
+# show up to three tweets; unlike [1:3], head() does not pad with NULL
+# when fewer than three were returned
+head(rdmTweets, 3)
+# convert each tweet to a data frame and row-bind them into one
+df <- do.call("rbind", lapply(rdmTweets, as.data.frame))
+dim(df)
+library(tm)
+# build a corpus, which is a collection of text documents
+# VectorSource specifies that the source is character vectors.
+myCorpus <- Corpus(VectorSource(df$text))
+# tolower is a base function rather than a tm transformation, so it is
+# wrapped in content_transformer() before being handed to tm_map()
+myCorpus <- tm_map(myCorpus, content_transformer(tolower))
+# remove punctuation
+myCorpus <- tm_map(myCorpus, removePunctuation)
+# remove numbers
+myCorpus <- tm_map(myCorpus, removeNumbers)
+# remove stopwords, but keep "r" by taking it out of the stopword list.
+# setdiff() leaves the list intact when "r" is not in it, whereas
+# myStopwords[-which(myStopwords == "r")] would return an empty vector.
+myStopwords <- setdiff(c(stopwords("english"), "available", "via"), "r")
+myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
+# keep an unstemmed copy to use as the dictionary for stem completion
+dictCorpus <- myCorpus
+# stem words in a text document with the snowball stemmers
+# NOTE(review): this page originally listed Snowball, RWeka, rJava and
+# RWekajars as requirements; recent tm releases appear to rely on
+# SnowballC instead -- confirm for your installation.
+myCorpus <- tm_map(myCorpus, stemDocument)
+# inspect the first three documents
+inspect(myCorpus[1:3])
+# stem completion
+# NOTE(review): stemCompletion() is documented as taking a character
+# vector of stems, while tm_map() hands it document content -- confirm
+# that this call still completes individual words in your tm version.
+myCorpus <- tm_map(myCorpus, stemCompletion, dictionary = dictCorpus)
+inspect(myCorpus[1:3])
+# wordLengths is the current name of the term-length filter; c(1, Inf)
+# keeps one-letter terms such as "r" (the default minimum length is 3)
+myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
+</code>