Here we will scrape recent tweets from Donald Trump’s Twitter account and plot them as a word cloud.

Scraping the tweets

The first step is to install and load the ‘rvest’ package.
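If it is not installed yet, install it once from CRAN:

install.packages('rvest')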

library(rvest)
## Loading required package: xml2

Note: A nifty tool that you should also install is SelectorGadget (http://selectorgadget.com/). It will help you discover the relevant CSS selectors easily.

Step 1: Downloading the website

First, specify the URL of the website you want to scrape:

url <- 'https://twitter.com/realDonaldTrump'

Read the HTML code from the website

webpage <- read_html(url)

Step 2: Using CSS selectors to scrape the tweets

Now we’d like to scrape each tweet’s text. Clicking on the first tweet with SelectorGadget tells us that this field is called .tweet-text

tweets_data_html <- html_nodes(webpage, '.tweet-text')

Now we convert the data to text

tweets_data <- html_text(tweets_data_html)

Let’s have a look at the tweets

head(tweets_data)
## [1] "Agnes, your great boy Ronald is looking down, very proud of you!pic.twitter.com/BHPu6IIdAN"                                                                                                                                                                                            
## [2] "“Trump 2020 baby!”\n\n@realDonaldTrump will win in 2020, and he’s going to do it with the help of millions of Black voters! @TheDailyShow #woke https://twitter.com/katrinapierson/status/1198600473663676416 …pic.twitter.com/QCzuzXxBul"                                             
## [3] "President @realDonaldTrump welcomed outstanding student athletes and coaches from all over the Nation to the White House for @NCAA Collegiate National Champions Day! pic.twitter.com/347AnR0KMk"                                                                                      
## [4] "“That he is unconventional is not a bad thing for me. Heck, it’s why I voted for him.”\n\nFantastic read from @SalenaZito on @realDonaldTrump’s unwavering support in the Rust Belt.https://nypost.com/2019/11/23/rust-belt-voters-on-trump-id-vote-for-him-again-in-a-heartbeat/ …"   
## [5] "....honors that he has earned, including his Trident Pin. Admiral and now Ambassador to Norway Ken Braithwaite will be nominated by me to be the new Secretary of the Navy. A man of great achievement and success, I know Ken will do an outstanding job!"                            
## [6] "....contracting procedures were not addressed to my satisfaction. Therefore, Secretary of the Navy Richard Spencer’s services have been terminated by Secretary of Defense Mark Esper. I thank Richard for his service & commitment. Eddie will retire peacefully with all of the....."

Scraping the tweets’ times

First, use CSS selectors to scrape the tweets’ timestamps (time elapsed since each tweet was posted)

times_data_html <- html_nodes(webpage, '.js-relative-timestamp')

Convert the data to text

times_data <- html_text(times_data_html)

Let’s have a look at the times

head(times_data)
## [1] "17h" "22h" "15h" "9h"  "12h" "12h"
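These are relative strings, not numbers. If you need them as numeric hours, a small helper is easy to sketch (our own addition, assuming only the ‘h’ and ‘m’ suffixes seen in the output above):

to_hours <- function(x) {
  n <- as.numeric(gsub('[^0-9]', '', x)) # strip the unit suffix
  ifelse(grepl('m$', x), n / 60, n)      # minutes become fractional hours
}
to_hours(c('17h', '45m')) # 17.00 0.75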

Result: word cloud of Trump’s tweets

Loading the relevant packages

library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

Convert to a corpus

speech <- tweets_data
modi <- Corpus(VectorSource(speech))

# Clean the data
modi_data<-tm_map(modi,stripWhitespace)
## Warning in tm_map.SimpleCorpus(modi, stripWhitespace): transformation drops
## documents
modi_data<-tm_map(modi_data,tolower)
## Warning in tm_map.SimpleCorpus(modi_data, tolower): transformation drops
## documents
modi_data<-tm_map(modi_data,removeNumbers)
## Warning in tm_map.SimpleCorpus(modi_data, removeNumbers): transformation
## drops documents
modi_data<-tm_map(modi_data,removePunctuation)
## Warning in tm_map.SimpleCorpus(modi_data, removePunctuation):
## transformation drops documents
modi_data<-tm_map(modi_data,removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(modi_data, removeWords,
## stopwords("english")): transformation drops documents

Create a term-document matrix. It is a mathematical matrix that describes the frequency of terms occurring in a collection of documents. In a term-document matrix, rows correspond to terms and columns correspond to documents (a document-term matrix is simply the transpose).

tdm_modi <- TermDocumentMatrix(modi_data) # creates a TDM
TDM1 <- as.matrix(tdm_modi) # convert it into a matrix format
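To see concretely what this structure looks like, here is a toy corpus of two made-up ‘documents’ (illustrative only, not the tweet data):

toy <- Corpus(VectorSource(c('great great win', 'great loss')))
inspect(TermDocumentMatrix(toy)) # rows: great, loss, win; columns: the two documents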

Examine the frequencies for every word

v = sort(rowSums(TDM1), decreasing = TRUE) 
head(v)
##               …       democrats     impeachment            will 
##               6               6               6               5 
##           great realdonaldtrump 
##               4               4
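Before plotting the cloud, a quick bar chart of the most frequent words can be a useful sanity check (an optional extra, not in the original):

par(mar = c(7, 4, 2, 2)) # extra bottom margin for the rotated labels
barplot(head(v, 10), las = 2, col = 'steelblue')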

Plot the word cloud

par(mar=c(0,0,0,0))
wordcloud(modi_data, scale=c(5,0.5), max.words=100,
          random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))
## Warning in wordcloud(modi_data, scale = c(5, 0.5), max.words = 100,
## random.order = FALSE, : impeachment could not be fit on page. It will not
## be plotted.
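The warning means the largest word did not fit at this scale. Shrinking the upper end of scale usually lets every word be placed (same call as above, smaller maximum font):

wordcloud(modi_data, scale = c(3.5, 0.5), max.words = 100,
          random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
          colors = brewer.pal(8, 'Dark2'))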

Another approach, more specific to Twitter:

The code below is adapted from https://psyteachr.github.io/hack-your-data/scrape-twitter.html; see that page for more detail.

library(rtweet) 
library(tidytext)
library(ggpubr) 
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## Loading required package: magrittr
library(tidyverse) 
## ── Attaching packages ──────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  2.1.3     ✔ purrr   0.3.2
## ✔ tidyr   1.0.0     ✔ dplyr   0.8.3
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ tibble  2.1.3     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate()     masks NLP::annotate()
## ✖ tidyr::extract()        masks magrittr::extract()
## ✖ dplyr::filter()         masks stats::filter()
## ✖ purrr::flatten()        masks rtweet::flatten()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ✖ purrr::pluck()          masks rvest::pluck()
## ✖ purrr::set_names()      masks magrittr::set_names()
trump <- get_timeline("@realdonaldtrump", n = 3200)
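Note that get_timeline() requires Twitter API credentials. A minimal authentication sketch using rtweet’s create_token() (every key string below is a placeholder for your own app’s credentials):

token <- create_token(
  app = 'my_app_name',
  consumer_key = 'XXXX',
  consumer_secret = 'XXXX',
  access_token = 'XXXX',
  access_secret = 'XXXX'
)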

We need to restructure trump into a one-token-per-row format.

tidy_tweets <- trump %>% # pipe the data frame
  filter(is_retweet == FALSE) %>% # only include original tweets
  select(status_id, text) %>% # select the variables of interest
  unnest_tokens(word, text) # split the text column into one token per row
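A quick look confirms the new shape:

head(tidy_tweets) # one row per (status_id, word) pair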

Remove numbers

no_numbers <- tidy_tweets %>%
    filter(is.na(as.numeric(word))) # remember filter() returns rows where conditions are true
## Warning: NAs introduced by coercion
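The coercion warning is expected: every non-numeric word yields an NA, which is exactly what we keep. A silent variant of the same filter (cosmetic only):

no_numbers <- tidy_tweets %>%
  filter(is.na(suppressWarnings(as.numeric(word))))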

Remove stopwords

no_stop_words <- no_numbers %>%
  anti_join(stop_words, by = "word")
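A quick frequency check at this stage (an optional aside, not in the original):

no_stop_words %>%
  count(word, sort = TRUE) %>% # most frequent words first
  head()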

Sentiment analysis

Let’s first have a look at the lexicon we will be using: nrc.

#install.packages('textdata')
library(textdata)
nrc <- get_sentiments("nrc") # get specific sentiment lexicons in a tidy format
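To actually look at it:

head(nrc) # a tibble with two columns: word and sentiment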

Now we want to find out the sentiment (i.e., the emotional content) associated with each word in our data frame no_stop_words

nrc_words <- no_stop_words %>%
  inner_join(nrc, by="word")

Visualize:

pie_words <- nrc_words %>%
  group_by(sentiment) %>% # group by sentiment type
  tally() %>% # count the number of rows per sentiment
  arrange(desc(n)) # arrange sentiments in descending order of frequency

Plot:

ggpubr::ggpie(pie_words, "n", label = "sentiment", 
      fill = "sentiment", color = "white", 
      palette = "Spectral")

Here is another nifty tutorial: http://utstat.toronto.edu/~nathan/teaching/sta4002/Class1/scrapingtwitterinR-NT.html