Here we will scrape recent tweets from Donald Trump’s Twitter account and plot them in a word cloud.
The first step is to install and load the ‘rvest’ package:
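# install.packages('rvest')   # run this once if the package is not installed yet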
library(rvest)
## Loading required package: xml2
Note: a nifty tool that you should also install is SelectorGadget (http://selectorgadget.com/). It helps you identify the relevant CSS selectors easily.
First, specify the url for the website you want to scrape:
url <- 'https://twitter.com/realDonaldTrump'
Read the HTML code from the website:
webpage <- read_html(url)
Now we’d like to scrape the text of the tweets. A click on the first tweet using SelectorGadget tells us that this field is called .tweet-text:
tweets_data_html <- html_nodes(webpage, '.tweet-text')
Now we convert the data to text:
tweets_data <- html_text(tweets_data_html)
Let’s have a look at the tweets:
head(tweets_data)
## [1] "Agnes, your great boy Ronald is looking down, very proud of you!pic.twitter.com/BHPu6IIdAN"
## [2] "“Trump 2020 baby!”\n\n@realDonaldTrump will win in 2020, and he’s going to do it with the help of millions of Black voters! @TheDailyShow #woke https://twitter.com/katrinapierson/status/1198600473663676416 …pic.twitter.com/QCzuzXxBul"
## [3] "President @realDonaldTrump welcomed outstanding student athletes and coaches from all over the Nation to the White House for @NCAA Collegiate National Champions Day! pic.twitter.com/347AnR0KMk"
## [4] "“That he is unconventional is not a bad thing for me. Heck, it’s why I voted for him.”\n\nFantastic read from @SalenaZito on @realDonaldTrump’s unwavering support in the Rust Belt.https://nypost.com/2019/11/23/rust-belt-voters-on-trump-id-vote-for-him-again-in-a-heartbeat/ …"
## [5] "....honors that he has earned, including his Trident Pin. Admiral and now Ambassador to Norway Ken Braithwaite will be nominated by me to be the new Secretary of the Navy. A man of great achievement and success, I know Ken will do an outstanding job!"
## [6] "....contracting procedures were not addressed to my satisfaction. Therefore, Secretary of the Navy Richard Spencer’s services have been terminated by Secretary of Defense Mark Esper. I thank Richard for his service & commitment. Eddie will retire peacefully with all of the....."
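As the output shows, the scraped text still contains URLs and pic.twitter.com links, which would otherwise end up in the word cloud. As an optional extra step (not part of the original walkthrough), you could strip them with a regular expression before any further processing:
# optional: remove URLs and pic.twitter.com links from the tweet texts
tweets_data <- gsub("https?://\\S+|pic\\.twitter\\.com/\\S+", "", tweets_data)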
Next, use CSS selectors to scrape the tweets’ timestamps (the time elapsed since each tweet was posted):
times_data_html <- html_nodes(webpage, '.js-relative-timestamp')
Convert the data to text:
times_data <- html_text(times_data_html)
Let’s have a look at the times:
head(times_data)
## [1] "17h" "22h" "15h" "9h" "12h" "12h"
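The timestamps come back as character strings such as "17h". If you need them as numbers, here is a minimal sketch, assuming every value is in whole hours as above:
# rough sketch: turn "17h"-style relative timestamps into numeric hours
# (assumes all values end in "h"; tweets younger than an hour would show "m" instead)
hours_ago <- as.numeric(sub("h$", "", times_data))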
Load the relevant packages for text mining and plotting:
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
Convert the tweets to a corpus:
speech <- tweets_data
modi <- Corpus(VectorSource(speech))
# Clean the data
modi_data <- tm_map(modi, stripWhitespace)       # collapse extra whitespace
## Warning in tm_map.SimpleCorpus(modi, stripWhitespace): transformation drops
## documents
modi_data <- tm_map(modi_data, tolower)          # convert to lower case
## Warning in tm_map.SimpleCorpus(modi_data, tolower): transformation drops
## documents
modi_data <- tm_map(modi_data, removeNumbers)    # drop numbers
## Warning in tm_map.SimpleCorpus(modi_data, removeNumbers): transformation
## drops documents
modi_data <- tm_map(modi_data, removePunctuation)  # drop punctuation
## Warning in tm_map.SimpleCorpus(modi_data, removePunctuation):
## transformation drops documents
modi_data <- tm_map(modi_data, removeWords, stopwords('english'))  # drop stop words
## Warning in tm_map.SimpleCorpus(modi_data, removeWords,
## stopwords("english")): transformation drops documents
Note: the “transformation drops documents” warnings above are a known quirk of tm’s SimpleCorpus and can safely be ignored here.
Create a term-document matrix: a matrix that describes the frequency of the terms occurring in a collection of documents. In a term-document matrix, rows correspond to terms and columns correspond to documents; a document-term matrix is simply its transpose.
tdm_modi <- TermDocumentMatrix(modi_data)  # create the TDM
TDM1 <- as.matrix(tdm_modi)                # convert it into a plain matrix
Examine the frequencies for every word:
v <- sort(rowSums(TDM1), decreasing = TRUE)
head(v)
## … democrats impeachment will
## 6         6           6    5
## great realdonaldtrump
##     4               4
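Before drawing the cloud, a quick bar plot of the most frequent terms can serve as a sanity check (an optional sketch, not part of the original walkthrough):
# optional: bar plot of the ten most frequent terms
barplot(head(v, 10), las = 2, col = brewer.pal(8, 'Dark2'))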
par(mar = c(0, 0, 0, 0))  # remove plot margins
wordcloud(modi_data, scale = c(5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, 'Dark2'))
## Warning in wordcloud(modi_data, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : impeachment could not be fit on page. It will not
## be plotted.
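The warning means the most frequent word was too large for the plotting device at the chosen scale. Since we already computed the frequency vector v, an alternative (a sketch, not in the original code) is to pass words and frequencies explicitly and shrink the upper scale value so long words fit:
# alternative: plot from the precomputed frequencies with a smaller maximum size
wordcloud(names(v), v, scale = c(4, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, 'Dark2'))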
The section below follows https://psyteachr.github.io/hack-your-data/scrape-twitter.html; see that tutorial for more detail.
library(rtweet)
library(tidytext)
library(ggpubr)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: magrittr
library(tidyverse)
## ── Attaching packages ──────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.3 ✔ purrr 0.3.2
## ✔ tidyr 1.0.0 ✔ dplyr 0.8.3
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ purrr::flatten() masks rtweet::flatten()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::pluck() masks rvest::pluck()
## ✖ purrr::set_names() masks magrittr::set_names()
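# Note: get_timeline() needs Twitter API access; on first use, rtweet
# will typically open the browser and ask you to authorise the app.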
trump <- get_timeline("@realdonaldtrump", n = 3200)
tidy_tweets <- trump %>%            # pipe the data frame
  filter(is_retweet == FALSE) %>%   # only include original tweets
  select(status_id, text) %>%       # select the variables of interest
  unnest_tokens(word, text)         # split the text into one token per row
no_numbers <- tidy_tweets %>%
  filter(is.na(as.numeric(word)))   # filter() keeps rows where the condition is TRUE
## Warning: NAs introduced by coercion
This warning is expected: as.numeric() returns NA for every non-numeric word, which is exactly what the filter uses to keep them.
no_stop_words <- no_numbers %>%
  anti_join(stop_words, by = "word")  # drop rows whose word appears in the stop-word list
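As an optional check that the cleaning worked, look at the most frequent remaining words:
# optional check: most frequent words after dropping numbers and stop words
no_stop_words %>%
  count(word, sort = TRUE) %>%
  head()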
Let’s first have a look at the lexicon we will be using: nrc.
# install.packages('textdata')   # run once if needed
library(textdata)
nrc <- get_sentiments("nrc") # get specific sentiment lexicons in a tidy format
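For example, you can peek at the word–sentiment pairs and count how many words fall under each sentiment (an optional check):
head(nrc)                   # one row per word-sentiment pair
nrc %>% count(sentiment)    # number of words per sentiment category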
Now we want to find out the sentiments (i.e. the emotional content) for each word in our data frame no_stop_words:
nrc_words <- no_stop_words %>%
  inner_join(nrc, by = "word")  # keep only words found in the lexicon, with their sentiments
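To see which words drive each sentiment, you can count word–sentiment pairs, for example (a sketch using dplyr’s top_n(), which the dplyr version loaded above provides):
# optional: the three most frequent words for each sentiment
nrc_words %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(3, n)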
Visualize:
pie_words <- nrc_words %>%
  group_by(sentiment) %>%   # group by sentiment type
  tally() %>%               # count the number of rows per group
  arrange(desc(n))          # sort sentiments by frequency, descending
Plot:
ggpubr::ggpie(pie_words, "n", label = "sentiment",
              fill = "sentiment", color = "white",
              palette = "Spectral")
Here is another nifty tutorial: http://utstat.toronto.edu/~nathan/teaching/sta4002/Class1/scrapingtwitterinR-NT.html