sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.2  magrittr_1.5    tools_3.6.2     htmltools_0.4.0
##  [5] yaml_2.2.0      Rcpp_1.0.3      stringi_1.4.5   rmarkdown_2.1  
##  [9] knitr_1.27      stringr_1.4.0   xfun_0.12       digest_0.6.23  
## [13] rlang_0.4.2     evaluate_0.14

Load tidyverse and other packages for this lecture:

library("tidyverse")
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("rvest")
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library("quantmod")
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Version 0.4-0 included new data defaults. See ?getSymbols.

Web scraping

There is a wealth of data on the internet. How can we scrape and analyze it?

rvest

rvest is an R package written by Hadley Wickham which makes web scraping easy.

Example: Scraping from a webpage

Rank

  • Use SelectorGadget to highlight the element we want to scrape

  • Use the CSS selector to get the rankings

    # Use a CSS selector to scrape the rankings section; the outer
    # parentheses print the result in addition to assigning it
    (rank_data_html <- html_nodes(webpage, '.text-primary'))
    ## {xml_nodeset (100)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [3] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [4] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [5] <span class="lister-item-index unbold text-primary">5.</span>
    ##  [6] <span class="lister-item-index unbold text-primary">6.</span>
    ##  [7] <span class="lister-item-index unbold text-primary">7.</span>
    ##  [8] <span class="lister-item-index unbold text-primary">8.</span>
    ##  [9] <span class="lister-item-index unbold text-primary">9.</span>
    ## [10] <span class="lister-item-index unbold text-primary">10.</span>
    ## [11] <span class="lister-item-index unbold text-primary">11.</span>
    ## [12] <span class="lister-item-index unbold text-primary">12.</span>
    ## [13] <span class="lister-item-index unbold text-primary">13.</span>
    ## [14] <span class="lister-item-index unbold text-primary">14.</span>
    ## [15] <span class="lister-item-index unbold text-primary">15.</span>
    ## [16] <span class="lister-item-index unbold text-primary">16.</span>
    ## [17] <span class="lister-item-index unbold text-primary">17.</span>
    ## [18] <span class="lister-item-index unbold text-primary">18.</span>
    ## [19] <span class="lister-item-index unbold text-primary">19.</span>
    ## [20] <span class="lister-item-index unbold text-primary">20.</span>
    ## ...
    # Convert the ranking data to text
    (rank_data <- html_text(rank_data_html))
    ##   [1] "1."   "2."   "3."   "4."   "5."   "6."   "7."   "8."   "9."   "10." 
    ##  [11] "11."  "12."  "13."  "14."  "15."  "16."  "17."  "18."  "19."  "20." 
    ##  [21] "21."  "22."  "23."  "24."  "25."  "26."  "27."  "28."  "29."  "30." 
    ##  [31] "31."  "32."  "33."  "34."  "35."  "36."  "37."  "38."  "39."  "40." 
    ##  [41] "41."  "42."  "43."  "44."  "45."  "46."  "47."  "48."  "49."  "50." 
    ##  [51] "51."  "52."  "53."  "54."  "55."  "56."  "57."  "58."  "59."  "60." 
    ##  [61] "61."  "62."  "63."  "64."  "65."  "66."  "67."  "68."  "69."  "70." 
    ##  [71] "71."  "72."  "73."  "74."  "75."  "76."  "77."  "78."  "79."  "80." 
    ##  [81] "81."  "82."  "83."  "84."  "85."  "86."  "87."  "88."  "89."  "90." 
    ##  [91] "91."  "92."  "93."  "94."  "95."  "96."  "97."  "98."  "99."  "100."
    # Turn into numerical values (the trailing "." in strings such as "1."
    # does not prevent the conversion, as the printed result shows)
    (rank_data <- as.integer(rank_data))
    ##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
    ##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
    ##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
    ##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
    ##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
    ##  [91]  91  92  93  94  95  96  97  98  99 100

Title

  • Use SelectorGadget to find the CSS selector .lister-item-header a.

    # Use a CSS selector to scrape the title section
    (title_data_html <- html_nodes(webpage, '.lister-item-header a'))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/title/tt5727208/?ref_=adv_li_tt">Uncut Gems</a>
    ##  [2] <a href="/title/tt8579674/?ref_=adv_li_tt">1917</a>
    ##  [3] <a href="/title/tt6751668/?ref_=adv_li_tt">Parasite</a>
    ##  [4] <a href="/title/tt2584384/?ref_=adv_li_tt">Jojo Rabbit</a>
    ##  [5] <a href="/title/tt8367814/?ref_=adv_li_tt">The Gentlemen</a>
    ##  [6] <a href="/title/tt7131622/?ref_=adv_li_tt">Once Upon a Time... in Hollyw ...
    ##  [7] <a href="/title/tt3281548/?ref_=adv_li_tt">Little Women</a>
    ##  [8] <a href="/title/tt7286456/?ref_=adv_li_tt">Joker</a>
    ##  [9] <a href="/title/tt8946378/?ref_=adv_li_tt">Knives Out</a>
    ## [10] <a href="/title/tt1950186/?ref_=adv_li_tt">Ford v Ferrari</a>
    ## [11] <a href="/title/tt5606664/?ref_=adv_li_tt">Doctor Sleep</a>
    ## [12] <a href="/title/tt8688634/?ref_=adv_li_tt">21 Bridges</a>
    ## [13] <a href="/title/tt7984734/?ref_=adv_li_tt">The Lighthouse</a>
    ## [14] <a href="/title/tt1302006/?ref_=adv_li_tt">The Irishman</a>
    ## [15] <a href="/title/tt6105098/?ref_=adv_li_tt">The Lion King</a>
    ## [16] <a href="/title/tt6924650/?ref_=adv_li_tt">Midway</a>
    ## [17] <a href="/title/tt7653254/?ref_=adv_li_tt">Marriage Story</a>
    ## [18] <a href="/title/tt6450804/?ref_=adv_li_tt">Terminator: Dark Fate</a>
    ## [19] <a href="/title/tt3224458/?ref_=adv_li_tt">A Beautiful Day in the Neighb ...
    ## [20] <a href="/title/tt2527338/?ref_=adv_li_tt">Star Wars: Episode IX - The R ...
    ## ...
    # Converting the title data to text
    (title_data <- html_text(title_data_html))
    ##   [1] "Uncut Gems"                                   
    ##   [2] "1917"                                         
    ##   [3] "Parasite"                                     
    ##   [4] "Jojo Rabbit"                                  
    ##   [5] "The Gentlemen"                                
    ##   [6] "Once Upon a Time... in Hollywood"             
    ##   [7] "Little Women"                                 
    ##   [8] "Joker"                                        
    ##   [9] "Knives Out"                                   
    ##  [10] "Ford v Ferrari"                               
    ##  [11] "Doctor Sleep"                                 
    ##  [12] "21 Bridges"                                   
    ##  [13] "The Lighthouse"                               
    ##  [14] "The Irishman"                                 
    ##  [15] "The Lion King"                                
    ##  [16] "Midway"                                       
    ##  [17] "Marriage Story"                               
    ##  [18] "Terminator: Dark Fate"                        
    ##  [19] "A Beautiful Day in the Neighborhood"          
    ##  [20] "Star Wars: Episode IX - The Rise of Skywalker"
    ##  [21] "Alita: Battle Angel"                          
    ##  [22] "Midsommar"                                    
    ##  [23] "Bombshell"                                    
    ##  [24] "The Jesus Rolls"                              
    ##  [25] "Hustlers"                                     
    ##  [26] "Toy Story 4"                                  
    ##  [27] "Jumanji: The Next Level"                      
    ##  [28] "Avengers: Endgame"                            
    ##  [29] "Judy"                                         
    ##  [30] "The Lodge"                                    
    ##  [31] "Honey Boy"                                    
    ##  [32] "6 Underground"                                
    ##  [33] "Ad Astra"                                     
    ##  [34] "Harriet"                                      
    ##  [35] "Zombieland: Double Tap"                       
    ##  [36] "Cats"                                         
    ##  [37] "Just Mercy"                                   
    ##  [38] "Fighting with My Family"                      
    ##  [39] "Richard Jewell"                               
    ##  [40] "The Two Popes"                                
    ##  [41] "Frozen II"                                    
    ##  [42] "Motherless Brooklyn"                          
    ##  [43] "Come to Daddy"                                
    ##  [44] "Aladdin"                                      
    ##  [45] "Fast & Furious Presents: Hobbs & Shaw"        
    ##  [46] "Gemini Man"                                   
    ##  [47] "Dark Waters"                                  
    ##  [48] "Color Out of Space"                           
    ##  [49] "Maleficent: Mistress of Evil"                 
    ##  [50] "Booksmart"                                    
    ##  [51] "It Chapter Two"                               
    ##  [52] "Shazam!"                                      
    ##  [53] "The Last Full Measure"                        
    ##  [54] "Last Christmas"                               
    ##  [55] "John Wick: Chapter 3 - Parabellum"            
    ##  [56] "Waves"                                        
    ##  [57] "Jay and Silent Bob Reboot"                    
    ##  [58] "Pain and Glory"                               
    ##  [59] "Playing with Fire"                            
    ##  [60] "Spider-Man: Far from Home"                    
    ##  [61] "Queen & Slim"                                 
    ##  [62] "Portrait of a Lady on Fire"                   
    ##  [63] "The Good Liar"                                
    ##  [64] "Ready or Not"                                 
    ##  [65] "The King"                                     
    ##  [66] "Downton Abbey"                                
    ##  [67] "Troop Zero"                                   
    ##  [68] "Rambo: Last Blood"                            
    ##  [69] "Countdown"                                    
    ##  [70] "The Personal History of David Copperfield"    
    ##  [71] "Rocketman"                                    
    ##  [72] "A Hidden Life"                                
    ##  [73] "Bait"                                         
    ##  [74] "Us"                                           
    ##  [75] "The Peanut Butter Falcon"                     
    ##  [76] "After"                                        
    ##  [77] "Long Shot"                                    
    ##  [78] "Klaus"                                        
    ##  [79] "Godzilla: King of the Monsters"               
    ##  [80] "Captain Marvel"                               
    ##  [81] "True History of the Kelly Gang"               
    ##  [82] "The Addams Family"                            
    ##  [83] "Corpus Christi"                               
    ##  [84] "Mr. Jones"                                    
    ##  [85] "Code 8"                                       
    ##  [86] "Anna"                                         
    ##  [87] "Men in Black: International"                  
    ##  [88] "Angel Has Fallen"                             
    ##  [89] "Yesterday"                                    
    ##  [90] "Togo"                                         
    ##  [91] "Good Boys"                                    
    ##  [92] "The Farewell"                                 
    ##  [93] "Guns Akimbo"                                  
    ##  [94] "Charlie's Angels"                             
    ##  [95] "Monos"                                        
    ##  [96] "Hellboy"                                      
    ##  [97] "Ip Man 4: The Finale"                         
    ##  [98] "Spies in Disguise"                            
    ##  [99] "Extremely Wicked, Shockingly Evil and Vile"   
    ## [100] "Vivarium"

Description

  • # Use a CSS selector to scrape the description section
    (description_data_html <- html_nodes(webpage, '.ratings-bar+ .text-muted'))
    ## {xml_nodeset (100)}
    ##  [1] <p class="text-muted">\n    A charismatic New York City jeweler always o ...
    ##  [2] <p class="text-muted">\n    April 6th, 1917. As a regiment assembles to  ...
    ##  [3] <p class="text-muted">\n    A poor family, the Kims, con their way into  ...
    ##  [4] <p class="text-muted">\n    A young boy in Hitler's army finds out his m ...
    ##  [5] <p class="text-muted">\n    An American expat tries to sell off his high ...
    ##  [6] <p class="text-muted">\n    A faded television actor and his stunt doubl ...
    ##  [7] <p class="text-muted">\n    Jo March reflects back and forth on her life ...
    ##  [8] <p class="text-muted">\n    In Gotham City, mentally troubled comedian A ...
    ##  [9] <p class="text-muted">\n    A detective investigates the death of a patr ...
    ## [10] <p class="text-muted">\n    American car designer <a href="/name/nm07909 ...
    ## [11] <p class="text-muted">\n    Years following the events of "The Shining," ...
    ## [12] <p class="text-muted">\n    An embattled NYPD detective is thrust into a ...
    ## [13] <p class="text-muted">\n    Two lighthouse keepers try to maintain their ...
    ## [14] <p class="text-muted">\n    A mob hitman recalls his friend <a href="/na ...
    ## [15] <p class="text-muted">\n    After the murder of his father, a young lion ...
    ## [16] <p class="text-muted">\n    The story of the Battle of Midway, told by t ...
    ## [17] <p class="text-muted">\n    Noah Baumbach's incisive and compassionate l ...
    ## [18] <p class="text-muted">\n    An augmented human and Sarah Connor must sto ...
    ## [19] <p class="text-muted">\n    Based on the true story of a real-life frien ...
    ## [20] <p class="text-muted">\n    The surviving members of the resistance face ...
    ## ...
    # Converting the description data to text
    description_data <- html_text(description_data_html)
    # take a look at first few
    head(description_data)
    ## [1] "\n    A charismatic New York City jeweler always on the lookout for the next big score makes a series of high-stakes bets that could lead to the windfall of a lifetime. Howard must perform a precarious high-wire act, balancing business, family, and encroaching adversaries on all sides in his relentless pursuit of the ultimate win."
    ## [2] "\n    April 6th, 1917. As a regiment assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."                                                                                                                 
    ## [3] "\n    A poor family, the Kims, con their way into becoming the servants of a rich family, the Parks. But their easy life gets complicated when their deception is threatened with exposure."                                                                                                                                                 
    ## [4] "\n    A young boy in Hitler's army finds out his mother is hiding a Jewish girl in their home."                                                                                                                                                                                                                                              
    ## [5] "\n    An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                                                                                                                          
    ## [6] "\n    A faded television actor and his stunt double strive to achieve fame and success in the film industry during the final years of Hollywood's Golden Age in 1969 Los Angeles."
    # strip the leading '\n' and the whitespace that follows it
    description_data <- str_replace(description_data, "^\\n\\s+", "")
    head(description_data)
    ## [1] "A charismatic New York City jeweler always on the lookout for the next big score makes a series of high-stakes bets that could lead to the windfall of a lifetime. Howard must perform a precarious high-wire act, balancing business, family, and encroaching adversaries on all sides in his relentless pursuit of the ultimate win."
    ## [2] "April 6th, 1917. As a regiment assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."                                                                                                                 
    ## [3] "A poor family, the Kims, con their way into becoming the servants of a rich family, the Parks. But their easy life gets complicated when their deception is threatened with exposure."                                                                                                                                                 
    ## [4] "A young boy in Hitler's army finds out his mother is hiding a Jewish girl in their home."                                                                                                                                                                                                                                              
    ## [5] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                                                                                                                          
    ## [6] "A faded television actor and his stunt double strive to achieve fame and success in the film industry during the final years of Hollywood's Golden Age in 1969 Los Angeles."

Runtime

# Use a CSS selector to scrape the movie runtime section, remove the
# " min" suffix and convert to integer, all in a single pipeline
(runtime_data <- webpage %>%
  html_nodes('.runtime') %>%
  html_text() %>%
  str_replace(" min", "") %>%
  as.integer())
##   [1] 135 119 132 108 113 161 135 122 131 152 152  99 109 209 118 138 137 128
##  [19] 109 142 122 147 109  85 110 100 123 181 118 108  94 128 123 125  99 110
##  [37] 137 108 131 125 103 144  96 128 137 117 126 111 119 102 169 132 116 103
##  [55] 131 135 105 113  96 129 132 121 109  95 140 122  94  89  90 119 121 174
##  [73]  89 116  97 105 125  96 132 123 124  86 115 141  98 119 114 121 116 113
##  [91]  90 100  95 118 102 120 105 102 110  97
```r
# Use a CSS selector to scrape the movie runtime section
runtime_data_html <- html_nodes(webpage, '.runtime')
# Converting the runtime data to text
runtime_data <- html_text(runtime_data_html)
# Let's have a look at the runtime
head(runtime_data)
```

```
## [1] "135 min" "119 min" "132 min" "108 min" "113 min" "161 min"
```

```r
# Data-Preprocessing: remove the " min" suffix and convert to numeric
runtime_data <- str_replace(runtime_data, " min", "")
runtime_data <- as.numeric(runtime_data)
# Let's have another look at the runtime data
head(runtime_data)
```

```
## [1] 135 119 132 108 113 161
```

Genre

  • Collect the (first) genre of each movie:

    # Use a CSS selector to scrape the movie genre section
    genre_data_html <- html_nodes(webpage, '.genre')
    # Converting the genre data to text
    genre_data <- html_text(genre_data_html)
    # Let's have a look at the genre data
    head(genre_data)    
    ## [1] "\nCrime, Drama, Thriller            "
    ## [2] "\nDrama, War            "            
    ## [3] "\nComedy, Crime, Drama            "  
    ## [4] "\nComedy, Drama, War            "    
    ## [5] "\nAction, Crime            "         
    ## [6] "\nComedy, Drama            "
    # Data-Preprocessing: retrieve the first genre. Match everything up to
    # the first comma or whitespace rather than letters only, so hyphenated
    # genres such as "Sci-Fi" are kept whole instead of truncated to "Sci".
    genre_data <- str_extract(genre_data, "[^,\\s]+")
    # Converting each genre from text to factor (left disabled)
    #genre_data <- as.factor(genre_data)
    # Let's have another look at the genre data
    head(genre_data)
    ## [1] "Crime"  "Drama"  "Comedy" "Comedy" "Action" "Comedy"

Rating

  • # Use a CSS selector to scrape the IMDB rating section
    rating_data_html <- html_nodes(webpage, '.ratings-imdb-rating strong')
    # Converting the ratings data to text
    rating_data <- html_text(rating_data_html)
    # Let's have a look at the ratings
    head(rating_data)
    ## [1] "7.7" "8.5" "8.6" "8.0" "8.1" "7.7"
    # Data-Preprocessing: converting ratings to numerical
    rating_data <- as.numeric(rating_data)
    # Let's have another look at the ratings data
    rating_data
    ##   [1] 7.7 8.5 8.6 8.0 8.1 7.7 8.1 8.6 8.0 8.2 7.5 6.6 7.7 8.0 6.9 6.8 8.0 6.3
    ##  [19] 7.4 6.9 7.3 7.2 6.8 4.5 6.4 7.9 7.0 8.5 7.0 6.6 7.4 6.1 6.6 6.3 6.8 2.8
    ##  [37] 7.5 7.1 7.5 7.6 7.1 6.9 6.2 7.0 6.5 5.7 7.6 6.3 6.7 7.2 6.6 7.1 6.5 6.5
    ##  [55] 7.5 7.7 5.8 7.6 4.8 7.6 7.1 8.2 6.5 6.9 7.3 7.4 7.0 6.2 5.3 6.6 7.4 7.6
    ##  [73] 7.3 6.9 7.7 5.4 6.9 8.2 6.1 6.9 5.9 5.8 7.8 7.0 6.1 6.6 5.6 6.4 6.9 8.1
    ##  [91] 6.7 7.7 7.5 4.2 7.2 5.2 7.4 6.8 6.6 6.4

Votes

  • # Use a CSS selector to scrape the votes section
    votes_data_html <- html_nodes(webpage, '.sort-num_votes-visible span:nth-child(2)')
    # Converting the votes data to text
    votes_data <- html_text(votes_data_html)
    # Let's have a look at the votes data
    head(votes_data)
    ## [1] "83,566"  "166,010" "216,718" "104,999" "28,099"  "376,301"
    # Data-Preprocessing: removing commas. str_replace_all() is required
    # because str_replace() only removes the first comma, so a count such
    # as "1,234,567" would become "1234,567" and then NA in as.numeric().
    votes_data <- str_replace_all(votes_data, ",", "")
    # Data-Preprocessing: converting votes to numerical
    votes_data <- as.numeric(votes_data)
    #Let's have another look at the votes data
    votes_data
    ##   [1]  83566 166010 216718 104999  28099 376301  54597 666811 157627 130613
    ##  [11]  68685  12102  68873 235933 163671  24771 164023  90804  22450 252611
    ##  [21] 187763 113708  28806    415  51271 152638  74662 663960  20865   1670
    ##  [31]   7438  98131 140689   9711  80932  25935   9666  50780  16275  70005
    ##  [41]  66365  19253   1742 184442 132024  52581  11148   4923  52000  63862
    ##  [51] 155862 214675   1764  25895 222554   6121   9089  32392   5016 261768
    ##  [61]   7353  14721   9909  62581  62603  26857   3121  55710  12229   2483
    ##  [71]  94514   5392   1578 181696  32194  23500  69513  55689 107850 393541
    ##  [81]   1132  14880   2620   1358  11212  39427  85103  50720  77311  15937
    ##  [91]  40581  28467   1506  18344   6495  66399   4749   6450  63633    815

Director

  • # Use a CSS selector to scrape the directors section
    (directors_data_html <- html_nodes(webpage,'.text-muted+ p a:nth-child(1)'))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/name/nm1509478/?ref_=adv_li_dr_0">Benny Safdie</a>
    ##  [2] <a href="/name/nm0005222/?ref_=adv_li_dr_0">Sam Mendes</a>
    ##  [3] <a href="/name/nm0094435/?ref_=adv_li_dr_0">Bong Joon Ho</a>
    ##  [4] <a href="/name/nm0169806/?ref_=adv_li_dr_0">Taika Waititi</a>
    ##  [5] <a href="/name/nm0005363/?ref_=adv_li_dr_0">Guy Ritchie</a>
    ##  [6] <a href="/name/nm0000233/?ref_=adv_li_dr_0">Quentin Tarantino</a>
    ##  [7] <a href="/name/nm1950086/?ref_=adv_li_dr_0">Greta Gerwig</a>
    ##  [8] <a href="/name/nm0680846/?ref_=adv_li_dr_0">Todd Phillips</a>
    ##  [9] <a href="/name/nm0426059/?ref_=adv_li_dr_0">Rian Johnson</a>
    ## [10] <a href="/name/nm0003506/?ref_=adv_li_dr_0">James Mangold</a>
    ## [11] <a href="/name/nm1093039/?ref_=adv_li_dr_0">Mike Flanagan</a>
    ## [12] <a href="/name/nm1047532/?ref_=adv_li_dr_0">Brian Kirk</a>
    ## [13] <a href="/name/nm3211470/?ref_=adv_li_dr_0">Robert Eggers</a>
    ## [14] <a href="/name/nm0000217/?ref_=adv_li_dr_0">Martin Scorsese</a>
    ## [15] <a href="/name/nm0269463/?ref_=adv_li_dr_0">Jon Favreau</a>
    ## [16] <a href="/name/nm0000386/?ref_=adv_li_dr_0">Roland Emmerich</a>
    ## [17] <a href="/name/nm0000876/?ref_=adv_li_dr_0">Noah Baumbach</a>
    ## [18] <a href="/name/nm1783265/?ref_=adv_li_dr_0">Tim Miller</a>
    ## [19] <a href="/name/nm1716636/?ref_=adv_li_dr_0">Marielle Heller</a>
    ## [20] <a href="/name/nm0009190/?ref_=adv_li_dr_0">J.J. Abrams</a>
    ## ...
    # Converting the directors data to text
    directors_data <- html_text(directors_data_html)
    # Let's have a look at the directors data
    directors_data
    ##   [1] "Benny Safdie"          "Sam Mendes"            "Bong Joon Ho"         
    ##   [4] "Taika Waititi"         "Guy Ritchie"           "Quentin Tarantino"    
    ##   [7] "Greta Gerwig"          "Todd Phillips"         "Rian Johnson"         
    ##  [10] "James Mangold"         "Mike Flanagan"         "Brian Kirk"           
    ##  [13] "Robert Eggers"         "Martin Scorsese"       "Jon Favreau"          
    ##  [16] "Roland Emmerich"       "Noah Baumbach"         "Tim Miller"           
    ##  [19] "Marielle Heller"       "J.J. Abrams"           "Robert Rodriguez"     
    ##  [22] "Ari Aster"             "Jay Roach"             "John Turturro"        
    ##  [25] "Lorene Scafaria"       "Josh Cooley"           "Jake Kasdan"          
    ##  [28] "Anthony Russo"         "Rupert Goold"          "Severin Fiala"        
    ##  [31] "Alma Har'el"           "Michael Bay"           "James Gray"           
    ##  [34] "Kasi Lemmons"          "Ruben Fleischer"       "Tom Hooper"           
    ##  [37] "Destin Daniel Cretton" "Stephen Merchant"      "Clint Eastwood"       
    ##  [40] "Fernando Meirelles"    "Chris Buck"            "Edward Norton"        
    ##  [43] "Ant Timpson"           "Guy Ritchie"           "David Leitch"         
    ##  [46] "Ang Lee"               "Todd Haynes"           "Richard Stanley"      
    ##  [49] "Joachim Rønning"       "Olivia Wilde"          "Andy Muschietti"      
    ##  [52] "David F. Sandberg"     "Todd Robinson"         "Paul Feig"            
    ##  [55] "Chad Stahelski"        "Trey Edward Shults"    "Kevin Smith"          
    ##  [58] "Pedro Almodóvar"       "Andy Fickman"          "Jon Watts"            
    ##  [61] "Melina Matsoukas"      "Céline Sciamma"        "Bill Condon"          
    ##  [64] "Matt Bettinelli-Olpin" "David Michôd"          "Michael Engler"       
    ##  [67] "Bert"                  "Adrian Grunberg"       "Justin Dec"           
    ##  [70] "Armando Iannucci"      "Dexter Fletcher"       "Terrence Malick"      
    ##  [73] "Mark Jenkin"           "Jordan Peele"          "Tyler Nilson"         
    ##  [76] "Jenny Gage"            "Jonathan Levine"       "Sergio Pablos"        
    ##  [79] "Michael Dougherty"     "Anna Boden"            "Justin Kurzel"        
    ##  [82] "Greg Tiernan"          "Jan Komasa"            "Agnieszka Holland"    
    ##  [85] "Jeff Chan"             "Luc Besson"            "F. Gary Gray"         
    ##  [88] "Ric Roman Waugh"       "Danny Boyle"           "Ericson Core"         
    ##  [91] "Gene Stupnitsky"       "Lulu Wang"             "Jason Lei Howden"     
    ##  [94] "Elizabeth Banks"       "Alejandro Landes"      "Neil Marshall"        
    ##  [97] "Wilson Yip"            "Nick Bruno"            "Joe Berlinger"        
    ## [100] "Lorcan Finnegan"

Actor

  • # Use a CSS selector to scrape the actors section
    (actors_data_html <- html_nodes(webpage, '.lister-item-content .ghost+ a'))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/name/nm11243726/?ref_=adv_li_st_0">Mesfin Lamengo</a>
    ##  [2] <a href="/name/nm2835616/?ref_=adv_li_st_0">Dean-Charles Chapman</a>
    ##  [3] <a href="/name/nm0814280/?ref_=adv_li_st_0">Kang-ho Song</a>
    ##  [4] <a href="/name/nm9877392/?ref_=adv_li_st_0">Roman Griffin Davis</a>
    ##  [5] <a href="/name/nm0000190/?ref_=adv_li_st_0">Matthew McConaughey</a>
    ##  [6] <a href="/name/nm0000138/?ref_=adv_li_st_0">Leonardo DiCaprio</a>
    ##  [7] <a href="/name/nm1519680/?ref_=adv_li_st_0">Saoirse Ronan</a>
    ##  [8] <a href="/name/nm0001618/?ref_=adv_li_st_0">Joaquin Phoenix</a>
    ##  [9] <a href="/name/nm0185819/?ref_=adv_li_st_0">Daniel Craig</a>
    ## [10] <a href="/name/nm0000354/?ref_=adv_li_st_0">Matt Damon</a>
    ## [11] <a href="/name/nm0000191/?ref_=adv_li_st_0">Ewan McGregor</a>
    ## [12] <a href="/name/nm1569276/?ref_=adv_li_st_0">Chadwick Boseman</a>
    ## [13] <a href="/name/nm1500155/?ref_=adv_li_st_0">Robert Pattinson</a>
    ## [14] <a href="/name/nm0000134/?ref_=adv_li_st_0">Robert De Niro</a>
    ## [15] <a href="/name/nm2255973/?ref_=adv_li_st_0">Donald Glover</a>
    ## [16] <a href="/name/nm4534098/?ref_=adv_li_st_0">Ed Skrein</a>
    ## [17] <a href="/name/nm3485845/?ref_=adv_li_st_0">Adam Driver</a>
    ## [18] <a href="/name/nm0000157/?ref_=adv_li_st_0">Linda Hamilton</a>
    ## [19] <a href="/name/nm0000158/?ref_=adv_li_st_0">Tom Hanks</a>
    ## [20] <a href="/name/nm0000402/?ref_=adv_li_st_0">Carrie Fisher</a>
    ## ...
    # Converting the actors data to text
    actors_data <- html_text(actors_data_html)
    # Let's have a look at the actors data
    head(actors_data)
    ## [1] "Mesfin Lamengo"       "Dean-Charles Chapman" "Kang-ho Song"        
    ## [4] "Roman Griffin Davis"  "Matthew McConaughey"  "Leonardo DiCaprio"

Metascore

  • Be careful with missing data.

    # Use a CSS selector to scrape the metascore section
    metascore_data_html <- html_nodes(webpage, '.metascore')
    # Converting the metascore data to text
    metascore_data <- html_text(metascore_data_html)
    # Let's have a look at the metascore 
    head(metascore_data)
    ## [1] "90        " "78        " "96        " "58        " "51        "
    ## [6] "83        "
    # Data-Preprocessing: removing trailing whitespace in metascore,
    # then converting to numerical
    metascore_data <- str_replace(metascore_data, "\\s*$", "")
    metascore_data <- as.numeric(metascore_data)
    metascore_data
    ##  [1] 90 78 96 58 51 83 91 59 82 81 59 51 83 94 55 47 93 54 80 53 53 72 64 79 84
    ## [26] 58 78 66 63 73 41 80 66 55 32 68 68 68 75 64 60 65 53 60 38 73 70 43 84 58
    ## [51] 71 52 50 73 80 46 87 24 69 74 95 55 64 62 64 59 26 31 75 69 78 84 81 70 30
    ## [76] 67 65 48 64 84 46 69 62 40 38 45 55 71 60 89 43 52 78 31 62 54 52
    # Lets check the length of metascore data
    length(metascore_data)
    ## [1] 97
    # Visual inspection finds 24, 85, 100 don't have metascore.
    # NOTE(review): these positions and the total of 100 are hard-coded for
    # the page as it was scraped here; re-check them if the page is scraped
    # again, otherwise the scores are assigned to the wrong movies.
    ms <- rep(NA, 100)
    # fill every position except the missing ones, leaving those as NA
    ms[-c(24, 85, 100)] <- metascore_data
    (metascore_data <- ms)
    ##   [1] 90 78 96 58 51 83 91 59 82 81 59 51 83 94 55 47 93 54 80 53 53 72 64 NA 79
    ##  [26] 84 58 78 66 63 73 41 80 66 55 32 68 68 68 75 64 60 65 53 60 38 73 70 43 84
    ##  [51] 58 71 52 50 73 80 46 87 24 69 74 95 55 64 62 64 59 26 31 75 69 78 84 81 70
    ##  [76] 30 67 65 48 64 84 46 69 62 NA 40 38 45 55 71 60 89 43 52 78 31 62 54 52 NA

Gross

  • Be careful with missing data.

    # Using CSS selectors to scrape the gross revenue section
    gross_data_html <- html_nodes(webpage,'.ghost~ .text-muted+ span')
    # Converting the gross revenue data to text
    gross_data <- html_text(gross_data_html)
    # Let's have a look at the gross data
    head(gross_data)
    ## [1] "$0.35M"   "$135.37M" "$192.73M" "$0.43M"   "$540.08M" "$85.71M"
    # Data preprocessing: turn strings such as "$135.37M" into numbers.
    # Drop the first "M", keep characters 2 to 10 (which skips the leading
    # "$"), then parse what is left as a number.
    #(gross_data <- str_extract(gross_data, "[:digit:]+.[:digit:]+"))
    gross_data <- gross_data %>%
      str_replace("M", "") %>%
      str_sub(2, 10) %>%
      as.numeric()
    # Let's check the length of gross data
    length(gross_data)
    ## [1] 40
    # Visual inspection finds that the movies below don't have gross data
    #gs_data <- rep(NA, 100)
    #gs_data[-c(1, 2, 3, 5, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
    #(gross_data <- gs_data)

    60 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.

    (rank_and_gross <- webpage %>%
      html_nodes('.ghost~ .text-muted+ span , .text-primary') %>%
      html_text() %>%
      str_replace("\\s+", "") %>%
      str_replace_all("[$M]", ""))
    ##   [1] "1."     "2."     "3."     "4."     "0.35"   "5."     "6."     "135.37"
    ##   [9] "7."     "8."     "192.73" "9."     "10."    "11."    "12."    "13."   
    ##  [17] "0.43"   "14."    "15."    "540.08" "16."    "17."    "18."    "19."   
    ##  [25] "20."    "21."    "85.71"  "22."    "27.33"  "23."    "24."    "25."   
    ##  [33] "80.55"  "26."    "433.03" "27."    "28."    "858.37" "29."    "30."   
    ##  [41] "31."    "32."    "33."    "35.40"  "34."    "35."    "26.80"  "36."   
    ##  [49] "37."    "38."    "22.96"  "39."    "40."    "41."    "42."    "43."   
    ##  [57] "44."    "354.87" "45."    "165.55" "46."    "20.55"  "47."    "48."   
    ##  [65] "49."    "36.95"  "50."    "22.68"  "51."    "193.77" "52."    "140.37"
    ##  [73] "53."    "54."    "55."    "171.02" "56."    "57."    "58."    "59."   
    ##  [81] "60."    "388.53" "61."    "62."    "63."    "64."    "26.74"  "65."   
    ##  [89] "66."    "31.03"  "67."    "68."    "18.87"  "69."    "70."    "71."   
    ##  [97] "96.37"  "72."    "73."    "74."    "175.01" "75."    "13.12"  "76."   
    ## [105] "12.14"  "77."    "30.32"  "78."    "79."    "110.50" "80."    "426.83"
    ## [113] "81."    "82."    "30.30"  "83."    "84."    "85."    "86."    "7.74"  
    ## [121] "87."    "79.80"  "88."    "67.16"  "89."    "73.29"  "90."    "91."   
    ## [129] "69.06"  "92."    "16.88"  "93."    "94."    "95."    "96."    "21.90" 
    ## [137] "97."    "98."    "99."    "100."
    # A rank looks like "12." (it ends with a period); anything else is a gross.
    isrank <- str_detect(rank_and_gross, "\\.$")
    # A rank immediately followed by another rank has no gross entry. The last
    # element has no successor, so it is missing its gross whenever it is a rank.
    ismissing <- c(head(isrank, -1) & tail(isrank, -1), tail(isrank, 1))
    missingpos <- as.integer(rank_and_gross[ismissing])
    # Fill the ranks that do have a gross value. Positive indexing is used
    # because gs_data[-missingpos] selects nothing when missingpos is empty,
    # which would leave every entry NA.
    gs_data <- rep(NA_real_, 100)
    gs_data[setdiff(seq_along(gs_data), missingpos)] <- gross_data
    (gross_data <- gs_data)
    ##   [1]     NA     NA     NA   0.35     NA 135.37     NA 192.73     NA     NA
    ##  [11]     NA     NA   0.43     NA 540.08     NA     NA     NA     NA     NA
    ##  [21]  85.71  27.33     NA     NA  80.55 433.03     NA 858.37     NA     NA
    ##  [31]     NA     NA  35.40     NA  26.80     NA     NA  22.96     NA     NA
    ##  [41]     NA     NA     NA 354.87 165.55  20.55     NA     NA  36.95  22.68
    ##  [51] 193.77 140.37     NA     NA 171.02     NA     NA     NA     NA 388.53
    ##  [61]     NA     NA     NA  26.74     NA  31.03     NA  18.87     NA     NA
    ##  [71]  96.37     NA     NA 175.01  13.12  12.14  30.32     NA 110.50 426.83
    ##  [81]     NA  30.30     NA     NA     NA   7.74  79.80  67.16  73.29     NA
    ##  [91]  69.06  16.88     NA     NA     NA  21.90     NA     NA     NA     NA

Missing entries - more reproducible way

  • The following code programmatically figures out the missing entries for metascore.

    # Use CSS selectors to scrape the rank and metascore nodes together
    (rank_metascore_data_html <- html_nodes(webpage, '.unfavorable , .favorable , .mixed , .text-primary'))
    ## {xml_nodeset (197)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="metascore  favorable">90        </span>
    ##  [3] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [4] <span class="metascore  favorable">78        </span>
    ##  [5] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [6] <span class="metascore  favorable">96        </span>
    ##  [7] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [8] <span class="metascore  mixed">58        </span>
    ##  [9] <span class="lister-item-index unbold text-primary">5.</span>
    ## [10] <span class="metascore  mixed">51        </span>
    ## [11] <span class="lister-item-index unbold text-primary">6.</span>
    ## [12] <span class="metascore  favorable">83        </span>
    ## [13] <span class="lister-item-index unbold text-primary">7.</span>
    ## [14] <span class="metascore  favorable">91        </span>
    ## [15] <span class="lister-item-index unbold text-primary">8.</span>
    ## [16] <span class="metascore  mixed">59        </span>
    ## [17] <span class="lister-item-index unbold text-primary">9.</span>
    ## [18] <span class="metascore  favorable">82        </span>
    ## [19] <span class="lister-item-index unbold text-primary">10.</span>
    ## [20] <span class="metascore  favorable">81        </span>
    ## ...
    # Convert the rank and metascore nodes to text
    (rank_metascore_data <- html_text(rank_metascore_data_html))
    ##   [1] "1."         "90        " "2."         "78        " "3."        
    ##   [6] "96        " "4."         "58        " "5."         "51        "
    ##  [11] "6."         "83        " "7."         "91        " "8."        
    ##  [16] "59        " "9."         "82        " "10."        "81        "
    ##  [21] "11."        "59        " "12."        "51        " "13."       
    ##  [26] "83        " "14."        "94        " "15."        "55        "
    ##  [31] "16."        "47        " "17."        "93        " "18."       
    ##  [36] "54        " "19."        "80        " "20."        "53        "
    ##  [41] "21."        "53        " "22."        "72        " "23."       
    ##  [46] "64        " "24."        "25."        "79        " "26."       
    ##  [51] "84        " "27."        "58        " "28."        "78        "
    ##  [56] "29."        "66        " "30."        "63        " "31."       
    ##  [61] "73        " "32."        "41        " "33."        "80        "
    ##  [66] "34."        "66        " "35."        "55        " "36."       
    ##  [71] "32        " "37."        "68        " "38."        "68        "
    ##  [76] "39."        "68        " "40."        "75        " "41."       
    ##  [81] "64        " "42."        "60        " "43."        "65        "
    ##  [86] "44."        "53        " "45."        "60        " "46."       
    ##  [91] "38        " "47."        "73        " "48."        "70        "
    ##  [96] "49."        "43        " "50."        "84        " "51."       
    ## [101] "58        " "52."        "71        " "53."        "52        "
    ## [106] "54."        "50        " "55."        "73        " "56."       
    ## [111] "80        " "57."        "46        " "58."        "87        "
    ## [116] "59."        "24        " "60."        "69        " "61."       
    ## [121] "74        " "62."        "95        " "63."        "55        "
    ## [126] "64."        "64        " "65."        "62        " "66."       
    ## [131] "64        " "67."        "59        " "68."        "26        "
    ## [136] "69."        "31        " "70."        "75        " "71."       
    ## [141] "69        " "72."        "78        " "73."        "84        "
    ## [146] "74."        "81        " "75."        "70        " "76."       
    ## [151] "30        " "77."        "67        " "78."        "65        "
    ## [156] "79."        "48        " "80."        "64        " "81."       
    ## [161] "84        " "82."        "46        " "83."        "69        "
    ## [166] "84."        "62        " "85."        "86."        "40        "
    ## [171] "87."        "38        " "88."        "45        " "89."       
    ## [176] "55        " "90."        "71        " "91."        "60        "
    ## [181] "92."        "89        " "93."        "43        " "94."       
    ## [186] "52        " "95."        "78        " "96."        "31        "
    ## [191] "97."        "62        " "98."        "54        " "99."       
    ## [196] "52        " "100."
    # Strip spaces
    (rank_metascore_data <- str_replace(rank_metascore_data, "\\s+", ""))
    ##   [1] "1."   "90"   "2."   "78"   "3."   "96"   "4."   "58"   "5."   "51"  
    ##  [11] "6."   "83"   "7."   "91"   "8."   "59"   "9."   "82"   "10."  "81"  
    ##  [21] "11."  "59"   "12."  "51"   "13."  "83"   "14."  "94"   "15."  "55"  
    ##  [31] "16."  "47"   "17."  "93"   "18."  "54"   "19."  "80"   "20."  "53"  
    ##  [41] "21."  "53"   "22."  "72"   "23."  "64"   "24."  "25."  "79"   "26." 
    ##  [51] "84"   "27."  "58"   "28."  "78"   "29."  "66"   "30."  "63"   "31." 
    ##  [61] "73"   "32."  "41"   "33."  "80"   "34."  "66"   "35."  "55"   "36." 
    ##  [71] "32"   "37."  "68"   "38."  "68"   "39."  "68"   "40."  "75"   "41." 
    ##  [81] "64"   "42."  "60"   "43."  "65"   "44."  "53"   "45."  "60"   "46." 
    ##  [91] "38"   "47."  "73"   "48."  "70"   "49."  "43"   "50."  "84"   "51." 
    ## [101] "58"   "52."  "71"   "53."  "52"   "54."  "50"   "55."  "73"   "56." 
    ## [111] "80"   "57."  "46"   "58."  "87"   "59."  "24"   "60."  "69"   "61." 
    ## [121] "74"   "62."  "95"   "63."  "55"   "64."  "64"   "65."  "62"   "66." 
    ## [131] "64"   "67."  "59"   "68."  "26"   "69."  "31"   "70."  "75"   "71." 
    ## [141] "69"   "72."  "78"   "73."  "84"   "74."  "81"   "75."  "70"   "76." 
    ## [151] "30"   "77."  "67"   "78."  "65"   "79."  "48"   "80."  "64"   "81." 
    ## [161] "84"   "82."  "46"   "83."  "69"   "84."  "62"   "85."  "86."  "40"  
    ## [171] "87."  "38"   "88."  "45"   "89."  "55"   "90."  "71"   "91."  "60"  
    ## [181] "92."  "89"   "93."  "43"   "94."  "52"   "95."  "78"   "96."  "31"  
    ## [191] "97."  "62"   "98."  "54"   "99."  "52"   "100."
    # a rank followed by another rank means the metascore for the 1st rank is missing
    (isrank <- str_detect(rank_metascore_data, "\\.$"))
    ##   [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [13]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [25]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [37]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
    ##  [49] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ##  [61] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ##  [73] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ##  [85] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ##  [97] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [109] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [121] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [133] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [145] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [157] FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
    ## [169]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [181]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [193]  TRUE FALSE  TRUE FALSE  TRUE
    # Compare each element with its successor: TRUE where a rank is immediately
    # followed by another rank. head()/tail() avoid the precedence trap in
    # `1:n-1`, which parses as `(1:n) - 1` (i.e. 0:(n-1)), not `1:(n-1)`.
    ismissing <- head(isrank, -1) & tail(isrank, -1)
    # The last element has no successor; its metascore is missing if it is a rank.
    ismissing <- c(ismissing, tail(isrank, 1))
    (missingpos <- as.integer(rank_metascore_data[ismissing]))
    ## [1]  24  85 100
    #(rank_metascore_data <- as.integer(rank_metascore_data))
  • You (students) should work out the code for finding missing positions for gross.

Visualizing movie data

  • Form a tibble:

    # Assemble the scraped vectors into one tibble, one column per attribute
    movies <- tibble(
      Rank = rank_data,
      Title = title_data,
      Description = description_data,
      Runtime = runtime_data,
      Genre = genre_data,
      Rating = rating_data,
      Metascore = metascore_data,
      Votes = votes_data,
      Gross_Earning_in_Mil = gross_data,
      Director = directors_data,
      Actor = actors_data
    )
    # Print every column instead of truncating to the console width
    print(movies, width = Inf)
    ## # A tibble: 100 x 11
    ##     Rank Title                           
    ##    <int> <chr>                           
    ##  1     1 Uncut Gems                      
    ##  2     2 1917                            
    ##  3     3 Parasite                        
    ##  4     4 Jojo Rabbit                     
    ##  5     5 The Gentlemen                   
    ##  6     6 Once Upon a Time... in Hollywood
    ##  7     7 Little Women                    
    ##  8     8 Joker                           
    ##  9     9 Knives Out                      
    ## 10    10 Ford v Ferrari                  
    ##    Description                                                                  
    ##    <chr>                                                                        
    ##  1 A charismatic New York City jeweler always on the lookout for the next big s…
    ##  2 April 6th, 1917. As a regiment assembles to wage war deep in enemy territory…
    ##  3 A poor family, the Kims, con their way into becoming the servants of a rich …
    ##  4 A young boy in Hitler's army finds out his mother is hiding a Jewish girl in…
    ##  5 An American expat tries to sell off his highly profitable marijuana empire i…
    ##  6 A faded television actor and his stunt double strive to achieve fame and suc…
    ##  7 Jo March reflects back and forth on her life, telling the beloved story of t…
    ##  8 In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and m…
    ##  9 A detective investigates the death of a patriarch of an eccentric, combative…
    ## 10 American car designer Carroll Shelby and driver Ken Miles battle corporate i…
    ##    Runtime Genre  Rating Metascore  Votes Gross_Earning_in_Mil Director         
    ##      <dbl> <chr>   <dbl>     <dbl>  <dbl>                <dbl> <chr>            
    ##  1     135 Crime     7.7        90  83566                NA    Benny Safdie     
    ##  2     119 Drama     8.5        78 166010                NA    Sam Mendes       
    ##  3     132 Comedy    8.6        96 216718                NA    Bong Joon Ho     
    ##  4     108 Comedy    8          58 104999                 0.35 Taika Waititi    
    ##  5     113 Action    8.1        51  28099                NA    Guy Ritchie      
    ##  6     161 Comedy    7.7        83 376301               135.   Quentin Tarantino
    ##  7     135 Drama     8.1        91  54597                NA    Greta Gerwig     
    ##  8     122 Crime     8.6        59 666811               193.   Todd Phillips    
    ##  9     131 Comedy    8          82 157627                NA    Rian Johnson     
    ## 10     152 Action    8.2        81 130613                NA    James Mangold    
    ##    Actor               
    ##    <chr>               
    ##  1 Mesfin Lamengo      
    ##  2 Dean-Charles Chapman
    ##  3 Kang-ho Song        
    ##  4 Roman Griffin Davis 
    ##  5 Matthew McConaughey 
    ##  6 Leonardo DiCaprio   
    ##  7 Saoirse Ronan       
    ##  8 Joaquin Phoenix     
    ##  9 Daniel Craig        
    ## 10 Matt Damon          
    ## # … with 90 more rows
  • How many top 100 movies are in each genre? (Be careful with interpretation.)

    movies %>%
      ggplot() +
      geom_bar(mapping = aes(x = Genre))

  • Which genre is most profitable in terms of average gross earnings?

    movies %>%
      group_by(Genre) %>%
      summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm=TRUE)) %>%
      ggplot() +
        geom_col(mapping = aes(x = Genre, y = avg_earning)) + 
        labs(y = "avg earning in millions")
    ## Warning: Removed 1 rows containing missing values (position_stack).

    ggplot(data = movies) +
      geom_boxplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) + 
      labs(y = "Gross earning in millions")
    ## Warning: Removed 60 rows containing non-finite values (stat_boxplot).

  • Is there a relationship between gross earning and rating? Find the best-selling movie (by gross earning) in each genre.

    library("ggrepel")
    # Top-grossing movie per genre: within each Genre group, rank the rows by
    # descending gross earning and keep the row ranked first. Rows whose gross
    # is NA get an NA rank, so filter() drops them. The outer parentheses print
    # the result, which is still grouped by Genre.
    (best_in_genre <- movies %>%
        group_by(Genre) %>%
        filter(row_number(desc(Gross_Earning_in_Mil)) == 1))
    ## # A tibble: 8 x 11
    ## # Groups:   Genre [8]
    ##    Rank Title Description Runtime Genre Rating Metascore  Votes Gross_Earning_i…
    ##   <int> <chr> <chr>         <dbl> <chr>  <dbl>     <dbl>  <dbl>            <dbl>
    ## 1     6 Once… A faded te…     161 Come…    7.7        83 376301            135. 
    ## 2     8 Joker In Gotham …     122 Crime    8.6        59 666811            193. 
    ## 3    15 The … After the …     118 Anim…    6.9        55 163671            540. 
    ## 4    28 Aven… After the …     181 Acti…    8.5        78 663960            858. 
    ## 5    44 Alad… A kind-hea…     128 Adve…    7          53 184442            355. 
    ## 6    51 It C… Twenty-sev…     169 Drama    6.6        58 155862            194. 
    ## 7    71 Rock… A musical …     121 Biog…    7.4        69  94514             96.4
    ## 8    74 Us    A family's…     116 Horr…    6.9        81 181696            175. 
    ## # … with 2 more variables: Director <chr>, Actor <chr>
    ggplot(movies, mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
      geom_point(mapping = aes(size = Votes, color = Genre)) + 
      ggrepel::geom_label_repel(aes(label = Title), data = best_in_genre) +
      labs(y = "Gross earning in millions")
    ## Warning: Removed 60 rows containing missing values (geom_point).

Example: Scraping image data from Google

Complete search operators are described at http://www.googleguide.com/advanced_operators_reference.html.

searchTerm <- "ucla"
# Build the Google image-search URL. The search term is percent-encoded so
# that terms containing spaces or reserved characters (e.g. "&", "#", "+")
# still produce a valid query string; a plain term such as "ucla" is unchanged.
# tbm=isch (images), app (apps), bks (books), nws (news), pts (patents), vid (videos)
# tbs=isz:m (medium images)
# <https://stenevang.wordpress.com/2013/02/22/google-advanced-power-search-url-request-parameters/>
(url <- str_c("https://www.google.com/search?q=",
              utils::URLencode(searchTerm, reserved = TRUE),
              "&source=lnms&tbm=isch&sa=X&tbs=isz:m"))
## [1] "https://www.google.com/search?q=ucla&source=lnms&tbm=isch&sa=X&tbs=isz:m"
webpage <- read_html(url)
(imageurl <- webpage %>% html_nodes("img") %>% html_attr("src"))
##  [1] "/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif"                           
##  [2] NA                                                                                                     
##  [3] NA                                                                                                     
##  [4] NA                                                                                                     
##  [5] "http://t1.gstatic.com/images?q=tbn:ANd9GcRi4lynSHbXZ4Iw8g2dqSWIHUbwYlVAnCG8JmoJk0m5TDqv7u1A4DZXIXo"   
##  [6] "http://t2.gstatic.com/images?q=tbn:ANd9GcR6WcG06o-hL1TdUt9koGmqZ3rb6gKXdH-qzeJzFxfxlPbwyl01vgcnmT5A"  
##  [7] "http://t3.gstatic.com/images?q=tbn:ANd9GcSb0UvxFeU1jMmffJp7HqtlvJWO8YCuTDtLrL4WEXBZyjRjG_ZwJhxknho"   
##  [8] "http://t0.gstatic.com/images?q=tbn:ANd9GcQky72Pltll9gse5zuuUB5v3usGPHsUWIQHkUxOBcCkaX8FQqGZKIrfPDHO"  
##  [9] "http://t1.gstatic.com/images?q=tbn:ANd9GcSL2OVtYLGq8WkODsaQvQdhx9L4Bo77jiyw85zxiI2CIfY9QUbXd0MifA"    
## [10] "http://t2.gstatic.com/images?q=tbn:ANd9GcQgDqKCP2Uj7dmIqDKLCzwnO2Nxe4NKkxqi7yzULvq5yIvx9AM5Let8VSfF"  
## [11] "http://t3.gstatic.com/images?q=tbn:ANd9GcQY6joNaBY8I1OslMcbnT6jTExmhBUJoUYQXLGdFzVTveOzs25XO6WNlr0"   
## [12] "http://t0.gstatic.com/images?q=tbn:ANd9GcRlohKXRrTHPQ2qUYe_QbQ_4xL7INWBdJxqcJO6JUe6z1qYt1YuJ9kxW6rH"  
## [13] "http://t1.gstatic.com/images?q=tbn:ANd9GcTACCo1ZrcHrK8aemBX40uKkyecusTqd2XNBq-8Pb_iadh1RK0BnA3UsEVK"  
## [14] "http://t2.gstatic.com/images?q=tbn:ANd9GcQd5o6YyCN6E9wW0_htp0X5E1huKFsX-O0zpl_J0xuon6DaaYicZClCVTPV"  
## [15] "http://t3.gstatic.com/images?q=tbn:ANd9GcSIwWQ2gn7Ae8lSfeTuK5QGLPv4v6O1uxD_RrVuDElsHEXOZQiPvIyEcWY"   
## [16] "http://t0.gstatic.com/images?q=tbn:ANd9GcTiL2-GBIJzMc0SVtc6lr5-jPgk-UiJB67YlMCRzwFOcJUxLg5X3WPdCtFs"  
## [17] "http://t1.gstatic.com/images?q=tbn:ANd9GcTbYVc_m_Ny1g4TF3ZcjIlyeD5QmCRS2yVXbk-M9_mDEwCDWZcHbgnNEIw"   
## [18] "http://t2.gstatic.com/images?q=tbn:ANd9GcRAL3ohwQw-hYoXz2wMITlAA9sOu0IEFURoW2ofFX0lFVHxOgXmNkqnjsLE"  
## [19] "http://t3.gstatic.com/images?q=tbn:ANd9GcSy9BXU7jML2lyCXeOwLp9SKejkm1-Va1BSkYjpVtu9UbexM7N2XFUwvxY"   
## [20] "http://t0.gstatic.com/images?q=tbn:ANd9GcTrlSU-lqvX0HxLSrEjgN0qKChaFkwTxom3KIhuurBh3RRJchYq9An_MuUS"  
## [21] "http://t1.gstatic.com/images?q=tbn:ANd9GcQi3kIBDeclmkBAQCFcaEhb4gymQLbZFtrfRNzQvgtAjfOAZ4FlLb39UQo"   
## [22] "http://t2.gstatic.com/images?q=tbn:ANd9GcTqhIjDcVUTotgA5pMeGH6U_jbJBrOd3WSklyTQPwNoB-crknsFUt9eXsQo"  
## [23] "http://t3.gstatic.com/images?q=tbn:ANd9GcTbOfKoRFdGCFacUJZsQ01W-iwwVVUh4y67vFkE4Jh-3IT9g75T26b0WidqAg"
## [24] "http://t0.gstatic.com/images?q=tbn:ANd9GcSsfJWAolCg59JerAWwBCmuwAIfANga0QgBV9k7o5Ru1BmLk-EyouqRttPD"  
## [25] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_g_color_18x26dp.gif"                          
## [26] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_red_o_color_16x26dp.gif"                      
## [27] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [28] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [29] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [30] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [31] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [32] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [33] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [34] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [35] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"                   
## [36] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_gle_with_chevron_color_100x26dp.gif"          
## [37] NA                                                                                                     
## [38] NA                                                                                                     
## [39] NA

The following code still does not work…

# Download each image URL in `files` to `<outPath>/<brand>_<i>.jpg`.
#
# files:   character vector of image URLs, e.g. scraped `src` attributes.
# brand:   prefix used in the destination file names.
# outPath: destination directory; created if it does not exist yet.
#
# Entries that are NA or not absolute http(s) URLs (such as site-relative
# paths like "/images/...") cannot be opened by download.file() and are
# skipped with a single warning. `i` in the file name is the position of the
# URL in `files`, so the numbering matches the input even when entries are
# skipped. A failed download is reported as a warning and does not stop the
# remaining downloads.
#
# Returns (invisibly) the paths of the files that were downloaded.
downloadImages <- function(files, brand, outPath = "images") {
  is_url <- !is.na(files) & grepl("^https?://", files)
  if (any(!is_url)) {
    warning("Skipping ", sum(!is_url),
            " entries that are NA or not absolute http(s) URLs",
            call. = FALSE)
  }

  targets <- which(is_url)
  if (length(targets) == 0) {
    return(invisible(character(0)))
  }

  if (!dir.exists(outPath)) {
    dir.create(outPath, recursive = TRUE)
  }

  destfiles <- file.path(outPath, paste0(brand, "_", targets, ".jpg"))
  ok <- logical(length(targets))
  for (k in seq_along(targets)) {
    src <- files[targets[k]]
    ok[k] <- tryCatch(
      # download.file() returns 0 on success
      download.file(src, destfile = destfiles[k], mode = "wb") == 0,
      error = function(e) {
        warning("Failed to download ", src, ": ", conditionMessage(e),
                call. = FALSE)
        FALSE
      }
    )
  }
  invisible(destfiles[ok])
}
downloadImages(imageurl, "ucla")
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, : URL
## '/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif':
## status was 'URL using bad/illegal format or missing URL'
## Error in download.file(files[i], destfile = paste0(outPath, "/", brand, : cannot open URL '/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif'
ls images/

Example: Scraping finance data

Example: Pull tweets into R

library(twitteR) #load package
# Credentials passed to setup_twitter_oauth() below. The 'XXXXXXXXXX' values
# are placeholders: substitute the keys and tokens issued for your own Twitter
# app, and avoid committing real secrets to a shared file.
consumer_key <- 'XXXXXXXXXX'
consumer_secret <- 'XXXXXXXXXX'
access_token <- 'XXXXXXXXXX'
access_secret <- 'XXXXXXXXXX'
# Authenticate this R session with the four credentials above
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"
virus <- searchTwitter('#China + #Coronavirus', 
                       n = 1000, 
                       since = '2020-01-01', 
                       retryOnRateLimit = 1e3)
virus_df <- as_tibble(twListToDF(virus))
virus_df %>% print(width = Inf)
## # A tibble: 1,000 x 16
##    text                                                                         
##    <chr>                                                                        
##  1 "Coronavirus in New York: What We Know So Far SEE DETAILS AT ==&gt; https://…
##  2 "RT @PaolaSCruz1990: @DeZurdaTeam @DiazCanelB @marti160patria @SecUJCuba @Fe…
##  3 "RT @HappeningNow__: I'm in shock      😱😱😱😭😭😭😭\n\n and very sad watching thi…
##  4 "RT @CirculoGloBal_I: Los Cuervos invaden  #XiNing #China #Xining #Coronavir…
##  5 "A Chinese drugmaker said it has started mass-producing an experimental drug…
##  6 "#Coronavirus ya ha cobrado mil 112 vidas y lleva más de 44 mil infectados e…
##  7 "RT @WarsontheBrink: Shocking Revelations\n\n#Chinese Billionaire \"Guo Weng…
##  8 "RT @StephenMcDonell: New official #China #coronavirus figures: definitely i…
##  9 "RT @BeholdIsrael: Israelis &amp; Chinese gathered at Tel Aviv town hall squ…
## 10 "RT @WarsontheBrink: VIDEO: #China\n\n🚨GRAPHIC:🚨\n\nALLEGED #Coronavirus pat…
##    favorited favoriteCount replyToSN created             truncated replyToSID
##    <lgl>             <dbl> <chr>     <dttm>              <lgl>     <chr>     
##  1 FALSE                 0 <NA>      2020-02-12 03:29:06 TRUE      <NA>      
##  2 FALSE                 0 <NA>      2020-02-12 03:28:54 FALSE     <NA>      
##  3 FALSE                 0 <NA>      2020-02-12 03:28:50 FALSE     <NA>      
##  4 FALSE                 0 <NA>      2020-02-12 03:28:50 FALSE     <NA>      
##  5 FALSE                 0 <NA>      2020-02-12 03:28:45 TRUE      <NA>      
##  6 FALSE                 0 <NA>      2020-02-12 03:28:45 FALSE     <NA>      
##  7 FALSE                 0 <NA>      2020-02-12 03:28:41 FALSE     <NA>      
##  8 FALSE                 0 <NA>      2020-02-12 03:28:35 FALSE     <NA>      
##  9 FALSE                 0 <NA>      2020-02-12 03:28:34 FALSE     <NA>      
## 10 FALSE                 0 <NA>      2020-02-12 03:28:22 FALSE     <NA>      
##    id                  replyToUID
##    <chr>               <chr>     
##  1 1227434427497828354 <NA>      
##  2 1227434375522004993 <NA>      
##  3 1227434360867106817 <NA>      
##  4 1227434358656708609 <NA>      
##  5 1227434338410729474 <NA>      
##  6 1227434337181818880 <NA>      
##  7 1227434321461620736 <NA>      
##  8 1227434295540752385 <NA>      
##  9 1227434291723952133 <NA>      
## 10 1227434240213733376 <NA>      
##    statusSource                                                                 
##    <chr>                                                                        
##  1 "<a href=\"http://www.akidthaine.com\" rel=\"nofollow\">Clickclickme</a>"    
##  2 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
##  3 "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>"
##  4 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
##  5 "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for…
##  6 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
##  7 "<a href=\"http://sinproject.net/tweecha/\" rel=\"nofollow\">tweechaPrime</a…
##  8 "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>"
##  9 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
## 10 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
##    screenName      retweetCount isRetweet retweeted longitude latitude
##    <chr>                  <dbl> <lgl>     <lgl>     <lgl>     <lgl>   
##  1 bitcoinconnect             0 FALSE     FALSE     NA        NA      
##  2 MonicaR45472290           24 TRUE      FALSE     NA        NA      
##  3 ka51087963               111 TRUE      FALSE     NA        NA      
##  4 ikerpaul                  30 TRUE      FALSE     NA        NA      
##  5 Apex_WW                    0 FALSE     FALSE     NA        NA      
##  6 KarlitaTmUs                0 FALSE     FALSE     NA        NA      
##  7 sharmabrr                536 TRUE      FALSE     NA        NA      
##  8 fezziwig2019              40 TRUE      FALSE     NA        NA      
##  9 DanielWhorton             65 TRUE      FALSE     NA        NA      
## 10 jkcracker21               79 TRUE      FALSE     NA        NA      
## # … with 990 more rows

Example: Import data from Google sheets

See HW3.