sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.2 magrittr_1.5 tools_3.6.2 htmltools_0.4.0
## [5] yaml_2.2.0 Rcpp_1.0.3 stringi_1.4.5 rmarkdown_2.1
## [9] knitr_1.27 stringr_1.4.0 xfun_0.12 digest_0.6.23
## [13] rlang_0.4.2 evaluate_0.14
Load tidyverse and other packages for this lecture:
library("tidyverse")
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("rvest")
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
##
## pluck
## The following object is masked from 'package:readr':
##
## guess_encoding
library("quantmod")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Version 0.4-0 included new data defaults. See ?getSymbols.
There is a wealth of data on the internet. How can we scrape and analyze it?
rvest is an R package written by Hadley Wickham which makes web scraping easy.
We follow the instructions in a blog post by Saurav Kaushik to find the most popular feature films of 2019.
Install the SelectorGadget extension for Chrome.
The 100 most popular feature films released in 2019 can be accessed at page https://www.imdb.com/search/title?count=100&release_date=2019,2019&title_type=feature.
# Specify the URL of the page to be scraped. HTTPS is used so the request
# matches the address quoted in the text and does not rely on a redirect.
url <- "https://www.imdb.com/search/title?count=100&release_date=2019,2019&title_type=feature"
# Read and parse the HTML of the page; the outer parentheses also print it
(webpage <- read_html(url))
## {html_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body id="styleguide-v2" class="fixed">\n <img height="1" widt ...
Use the CSS selector to get the rankings
# Select the ranking nodes with the CSS selector `.text-primary`
rank_data_html <- webpage %>% html_nodes(".text-primary")
rank_data_html
## {xml_nodeset (100)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="lister-item-index unbold text-primary">2.</span>
## [3] <span class="lister-item-index unbold text-primary">3.</span>
## [4] <span class="lister-item-index unbold text-primary">4.</span>
## [5] <span class="lister-item-index unbold text-primary">5.</span>
## [6] <span class="lister-item-index unbold text-primary">6.</span>
## [7] <span class="lister-item-index unbold text-primary">7.</span>
## [8] <span class="lister-item-index unbold text-primary">8.</span>
## [9] <span class="lister-item-index unbold text-primary">9.</span>
## [10] <span class="lister-item-index unbold text-primary">10.</span>
## [11] <span class="lister-item-index unbold text-primary">11.</span>
## [12] <span class="lister-item-index unbold text-primary">12.</span>
## [13] <span class="lister-item-index unbold text-primary">13.</span>
## [14] <span class="lister-item-index unbold text-primary">14.</span>
## [15] <span class="lister-item-index unbold text-primary">15.</span>
## [16] <span class="lister-item-index unbold text-primary">16.</span>
## [17] <span class="lister-item-index unbold text-primary">17.</span>
## [18] <span class="lister-item-index unbold text-primary">18.</span>
## [19] <span class="lister-item-index unbold text-primary">19.</span>
## [20] <span class="lister-item-index unbold text-primary">20.</span>
## ...
# Extract the text of each ranking node
rank_data <- rank_data_html %>% html_text()
rank_data
## [1] "1." "2." "3." "4." "5." "6." "7." "8." "9." "10."
## [11] "11." "12." "13." "14." "15." "16." "17." "18." "19." "20."
## [21] "21." "22." "23." "24." "25." "26." "27." "28." "29." "30."
## [31] "31." "32." "33." "34." "35." "36." "37." "38." "39." "40."
## [41] "41." "42." "43." "44." "45." "46." "47." "48." "49." "50."
## [51] "51." "52." "53." "54." "55." "56." "57." "58." "59." "60."
## [61] "61." "62." "63." "64." "65." "66." "67." "68." "69." "70."
## [71] "71." "72." "73." "74." "75." "76." "77." "78." "79." "80."
## [81] "81." "82." "83." "84." "85." "86." "87." "88." "89." "90."
## [91] "91." "92." "93." "94." "95." "96." "97." "98." "99." "100."
# Convert the rank labels (e.g. "1.") to integers
rank_data <- rank_data %>% as.integer()
rank_data
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100
Use SelectorGadget to find the CSS selector `.lister-item-header a`.
# Select the title links with the CSS selector `.lister-item-header a`
title_data_html <- webpage %>% html_nodes(".lister-item-header a")
title_data_html
## {xml_nodeset (100)}
## [1] <a href="/title/tt5727208/?ref_=adv_li_tt">Uncut Gems</a>
## [2] <a href="/title/tt8579674/?ref_=adv_li_tt">1917</a>
## [3] <a href="/title/tt6751668/?ref_=adv_li_tt">Parasite</a>
## [4] <a href="/title/tt2584384/?ref_=adv_li_tt">Jojo Rabbit</a>
## [5] <a href="/title/tt8367814/?ref_=adv_li_tt">The Gentlemen</a>
## [6] <a href="/title/tt7131622/?ref_=adv_li_tt">Once Upon a Time... in Hollyw ...
## [7] <a href="/title/tt3281548/?ref_=adv_li_tt">Little Women</a>
## [8] <a href="/title/tt7286456/?ref_=adv_li_tt">Joker</a>
## [9] <a href="/title/tt8946378/?ref_=adv_li_tt">Knives Out</a>
## [10] <a href="/title/tt1950186/?ref_=adv_li_tt">Ford v Ferrari</a>
## [11] <a href="/title/tt5606664/?ref_=adv_li_tt">Doctor Sleep</a>
## [12] <a href="/title/tt8688634/?ref_=adv_li_tt">21 Bridges</a>
## [13] <a href="/title/tt7984734/?ref_=adv_li_tt">The Lighthouse</a>
## [14] <a href="/title/tt1302006/?ref_=adv_li_tt">The Irishman</a>
## [15] <a href="/title/tt6105098/?ref_=adv_li_tt">The Lion King</a>
## [16] <a href="/title/tt6924650/?ref_=adv_li_tt">Midway</a>
## [17] <a href="/title/tt7653254/?ref_=adv_li_tt">Marriage Story</a>
## [18] <a href="/title/tt6450804/?ref_=adv_li_tt">Terminator: Dark Fate</a>
## [19] <a href="/title/tt3224458/?ref_=adv_li_tt">A Beautiful Day in the Neighb ...
## [20] <a href="/title/tt2527338/?ref_=adv_li_tt">Star Wars: Episode IX - The R ...
## ...
# Extract the text of each title node
title_data <- title_data_html %>% html_text()
title_data
## [1] "Uncut Gems"
## [2] "1917"
## [3] "Parasite"
## [4] "Jojo Rabbit"
## [5] "The Gentlemen"
## [6] "Once Upon a Time... in Hollywood"
## [7] "Little Women"
## [8] "Joker"
## [9] "Knives Out"
## [10] "Ford v Ferrari"
## [11] "Doctor Sleep"
## [12] "21 Bridges"
## [13] "The Lighthouse"
## [14] "The Irishman"
## [15] "The Lion King"
## [16] "Midway"
## [17] "Marriage Story"
## [18] "Terminator: Dark Fate"
## [19] "A Beautiful Day in the Neighborhood"
## [20] "Star Wars: Episode IX - The Rise of Skywalker"
## [21] "Alita: Battle Angel"
## [22] "Midsommar"
## [23] "Bombshell"
## [24] "The Jesus Rolls"
## [25] "Hustlers"
## [26] "Toy Story 4"
## [27] "Jumanji: The Next Level"
## [28] "Avengers: Endgame"
## [29] "Judy"
## [30] "The Lodge"
## [31] "Honey Boy"
## [32] "6 Underground"
## [33] "Ad Astra"
## [34] "Harriet"
## [35] "Zombieland: Double Tap"
## [36] "Cats"
## [37] "Just Mercy"
## [38] "Fighting with My Family"
## [39] "Richard Jewell"
## [40] "The Two Popes"
## [41] "Frozen II"
## [42] "Motherless Brooklyn"
## [43] "Come to Daddy"
## [44] "Aladdin"
## [45] "Fast & Furious Presents: Hobbs & Shaw"
## [46] "Gemini Man"
## [47] "Dark Waters"
## [48] "Color Out of Space"
## [49] "Maleficent: Mistress of Evil"
## [50] "Booksmart"
## [51] "It Chapter Two"
## [52] "Shazam!"
## [53] "The Last Full Measure"
## [54] "Last Christmas"
## [55] "John Wick: Chapter 3 - Parabellum"
## [56] "Waves"
## [57] "Jay and Silent Bob Reboot"
## [58] "Pain and Glory"
## [59] "Playing with Fire"
## [60] "Spider-Man: Far from Home"
## [61] "Queen & Slim"
## [62] "Portrait of a Lady on Fire"
## [63] "The Good Liar"
## [64] "Ready or Not"
## [65] "The King"
## [66] "Downton Abbey"
## [67] "Troop Zero"
## [68] "Rambo: Last Blood"
## [69] "Countdown"
## [70] "The Personal History of David Copperfield"
## [71] "Rocketman"
## [72] "A Hidden Life"
## [73] "Bait"
## [74] "Us"
## [75] "The Peanut Butter Falcon"
## [76] "After"
## [77] "Long Shot"
## [78] "Klaus"
## [79] "Godzilla: King of the Monsters"
## [80] "Captain Marvel"
## [81] "True History of the Kelly Gang"
## [82] "The Addams Family"
## [83] "Corpus Christi"
## [84] "Mr. Jones"
## [85] "Code 8"
## [86] "Anna"
## [87] "Men in Black: International"
## [88] "Angel Has Fallen"
## [89] "Yesterday"
## [90] "Togo"
## [91] "Good Boys"
## [92] "The Farewell"
## [93] "Guns Akimbo"
## [94] "Charlie's Angels"
## [95] "Monos"
## [96] "Hellboy"
## [97] "Ip Man 4: The Finale"
## [98] "Spies in Disguise"
## [99] "Extremely Wicked, Shockingly Evil and Vile"
## [100] "Vivarium"
# Select the description paragraphs: the `.text-muted` element that directly
# follows each `.ratings-bar`
description_data_html <- webpage %>% html_nodes(".ratings-bar+ .text-muted")
description_data_html
## {xml_nodeset (100)}
## [1] <p class="text-muted">\n A charismatic New York City jeweler always o ...
## [2] <p class="text-muted">\n April 6th, 1917. As a regiment assembles to ...
## [3] <p class="text-muted">\n A poor family, the Kims, con their way into ...
## [4] <p class="text-muted">\n A young boy in Hitler's army finds out his m ...
## [5] <p class="text-muted">\n An American expat tries to sell off his high ...
## [6] <p class="text-muted">\n A faded television actor and his stunt doubl ...
## [7] <p class="text-muted">\n Jo March reflects back and forth on her life ...
## [8] <p class="text-muted">\n In Gotham City, mentally troubled comedian A ...
## [9] <p class="text-muted">\n A detective investigates the death of a patr ...
## [10] <p class="text-muted">\n American car designer <a href="/name/nm07909 ...
## [11] <p class="text-muted">\n Years following the events of "The Shining," ...
## [12] <p class="text-muted">\n An embattled NYPD detective is thrust into a ...
## [13] <p class="text-muted">\n Two lighthouse keepers try to maintain their ...
## [14] <p class="text-muted">\n A mob hitman recalls his friend <a href="/na ...
## [15] <p class="text-muted">\n After the murder of his father, a young lion ...
## [16] <p class="text-muted">\n The story of the Battle of Midway, told by t ...
## [17] <p class="text-muted">\n Noah Baumbach's incisive and compassionate l ...
## [18] <p class="text-muted">\n An augmented human and Sarah Connor must sto ...
## [19] <p class="text-muted">\n Based on the true story of a real-life frien ...
## [20] <p class="text-muted">\n The surviving members of the resistance face ...
## ...
# Extract the text of each description node
description_data <- description_data_html %>% html_text()
# Preview the first few raw descriptions
description_data %>% head()
## [1] "\n A charismatic New York City jeweler always on the lookout for the next big score makes a series of high-stakes bets that could lead to the windfall of a lifetime. Howard must perform a precarious high-wire act, balancing business, family, and encroaching adversaries on all sides in his relentless pursuit of the ultimate win."
## [2] "\n April 6th, 1917. As a regiment assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."
## [3] "\n A poor family, the Kims, con their way into becoming the servants of a rich family, the Parks. But their easy life gets complicated when their deception is threatened with exposure."
## [4] "\n A young boy in Hitler's army finds out his mother is hiding a Jewish girl in their home."
## [5] "\n An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [6] "\n A faded television actor and his stunt double strive to achieve fame and success in the film industry during the final years of Hollywood's Golden Age in 1969 Los Angeles."
# Remove the leading newline and the indentation that follows it
description_data <- description_data %>% str_remove("^\\n\\s+")
description_data %>% head()
## [1] "A charismatic New York City jeweler always on the lookout for the next big score makes a series of high-stakes bets that could lead to the windfall of a lifetime. Howard must perform a precarious high-wire act, balancing business, family, and encroaching adversaries on all sides in his relentless pursuit of the ultimate win."
## [2] "April 6th, 1917. As a regiment assembles to wage war deep in enemy territory, two soldiers are assigned to race against time and deliver a message that will stop 1,600 men from walking straight into a deadly trap."
## [3] "A poor family, the Kims, con their way into becoming the servants of a rich family, the Parks. But their easy life gets complicated when their deception is threatened with exposure."
## [4] "A young boy in Hitler's army finds out his mother is hiding a Jewish girl in their home."
## [5] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."
## [6] "A faded television actor and his stunt double strive to achieve fame and success in the film industry during the final years of Hollywood's Golden Age in 1969 Los Angeles."
# Scrape the runtime of each movie from the `.runtime` nodes
runtime_data <- html_text(html_nodes(webpage, ".runtime"))
# Drop the " min" suffix and convert to integer minutes
runtime_data <- as.integer(str_replace(runtime_data, " min", ""))
runtime_data
## [1] 135 119 132 108 113 161 135 122 131 152 152 99 109 209 118 138 137 128
## [19] 109 142 122 147 109 85 110 100 123 181 118 108 94 128 123 125 99 110
## [37] 137 108 131 125 103 144 96 128 137 117 126 111 119 102 169 132 116 103
## [55] 131 135 105 113 96 129 132 121 109 95 140 122 94 89 90 119 121 174
## [73] 89 116 97 105 125 96 132 123 124 86 115 141 98 119 114 121 116 113
## [91] 90 100 95 118 102 120 105 102 110 97
```r
# Select the runtime nodes with the CSS selector `.runtime`
runtime_data_html <- webpage %>% html_nodes(".runtime")
# Extract the text of each runtime node
runtime_data <- runtime_data_html %>% html_text()
# Preview the first few raw runtimes
runtime_data %>% head()
```
```
## [1] "135 min" "119 min" "132 min" "108 min" "113 min" "161 min"
```
```r
# Data preprocessing: drop the " min" suffix, then convert to numeric
runtime_data <- runtime_data %>%
  str_replace(" min", "") %>%
  as.numeric()
# Preview the cleaned runtimes
runtime_data %>% head()
```
```
## [1] 135 119 132 108 113 161
```
Collect the (first) genre of each movie:
# Select the genre nodes with the CSS selector `.genre`
genre_data_html <- webpage %>% html_nodes(".genre")
# Extract the text of each genre node
genre_data <- genre_data_html %>% html_text()
# Preview the first few raw genre strings
genre_data %>% head()
## [1] "\nCrime, Drama, Thriller "
## [2] "\nDrama, War "
## [3] "\nComedy, Crime, Drama "
## [4] "\nComedy, Drama, War "
## [5] "\nAction, Crime "
## [6] "\nComedy, Drama "
# Data-Preprocessing: keep only the first genre of each movie.
# Take everything before the first comma and trim the surrounding whitespace,
# so hyphenated genres such as "Sci-Fi" are kept whole (the pattern
# "[:alpha:]+" stops at the hyphen and would return "Sci").
genre_data <- str_trim(str_extract(genre_data, "[^,]+"))
# Converting each genre from text to factor
#genre_data <- as.factor(genre_data)
# Let's have another look at the genre data
head(genre_data)
## [1] "Crime" "Drama" "Comedy" "Comedy" "Action" "Comedy"
# Select the IMDb rating nodes with `.ratings-imdb-rating strong`
rating_data_html <- webpage %>% html_nodes(".ratings-imdb-rating strong")
# Extract the text of each rating node
rating_data <- rating_data_html %>% html_text()
# Preview the first few raw ratings
rating_data %>% head()
## [1] "7.7" "8.5" "8.6" "8.0" "8.1" "7.7"
# Data preprocessing: convert the rating strings to numeric
rating_data <- rating_data %>% as.numeric()
# Show all converted ratings
rating_data
## [1] 7.7 8.5 8.6 8.0 8.1 7.7 8.1 8.6 8.0 8.2 7.5 6.6 7.7 8.0 6.9 6.8 8.0 6.3
## [19] 7.4 6.9 7.3 7.2 6.8 4.5 6.4 7.9 7.0 8.5 7.0 6.6 7.4 6.1 6.6 6.3 6.8 2.8
## [37] 7.5 7.1 7.5 7.6 7.1 6.9 6.2 7.0 6.5 5.7 7.6 6.3 6.7 7.2 6.6 7.1 6.5 6.5
## [55] 7.5 7.7 5.8 7.6 4.8 7.6 7.1 8.2 6.5 6.9 7.3 7.4 7.0 6.2 5.3 6.6 7.4 7.6
## [73] 7.3 6.9 7.7 5.4 6.9 8.2 6.1 6.9 5.9 5.8 7.8 7.0 6.1 6.6 5.6 6.4 6.9 8.1
## [91] 6.7 7.7 7.5 4.2 7.2 5.2 7.4 6.8 6.6 6.4
# Select the vote-count nodes: the second child span inside each
# `.sort-num_votes-visible` element
votes_data_html <- webpage %>%
  html_nodes(".sort-num_votes-visible span:nth-child(2)")
# Extract the text of each vote-count node
votes_data <- votes_data_html %>% html_text()
# Preview the first few raw vote counts
votes_data %>% head()
## [1] "83,566" "166,010" "216,718" "104,999" "28,099" "376,301"
# Data-Preprocessing: remove every thousands separator. str_replace() drops
# only the first comma, so counts of a million or more (e.g. "1,234,567")
# would still contain a comma and become NA in the conversion below.
votes_data <- str_replace_all(votes_data, ",", "")
# Data-Preprocessing: converting votes to numerical
votes_data <- as.numeric(votes_data)
# Let's have another look at the votes data
votes_data
## [1] 83566 166010 216718 104999 28099 376301 54597 666811 157627 130613
## [11] 68685 12102 68873 235933 163671 24771 164023 90804 22450 252611
## [21] 187763 113708 28806 415 51271 152638 74662 663960 20865 1670
## [31] 7438 98131 140689 9711 80932 25935 9666 50780 16275 70005
## [41] 66365 19253 1742 184442 132024 52581 11148 4923 52000 63862
## [51] 155862 214675 1764 25895 222554 6121 9089 32392 5016 261768
## [61] 7353 14721 9909 62581 62603 26857 3121 55710 12229 2483
## [71] 94514 5392 1578 181696 32194 23500 69513 55689 107850 393541
## [81] 1132 14880 2620 1358 11212 39427 85103 50720 77311 15937
## [91] 40581 28467 1506 18344 6495 66399 4749 6450 63633 815
# Select the first link in the paragraph that follows each `.text-muted`
# element; the printed nodes below are the director links
directors_data_html <- webpage %>% html_nodes(".text-muted+ p a:nth-child(1)")
directors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm1509478/?ref_=adv_li_dr_0">Benny Safdie</a>
## [2] <a href="/name/nm0005222/?ref_=adv_li_dr_0">Sam Mendes</a>
## [3] <a href="/name/nm0094435/?ref_=adv_li_dr_0">Bong Joon Ho</a>
## [4] <a href="/name/nm0169806/?ref_=adv_li_dr_0">Taika Waititi</a>
## [5] <a href="/name/nm0005363/?ref_=adv_li_dr_0">Guy Ritchie</a>
## [6] <a href="/name/nm0000233/?ref_=adv_li_dr_0">Quentin Tarantino</a>
## [7] <a href="/name/nm1950086/?ref_=adv_li_dr_0">Greta Gerwig</a>
## [8] <a href="/name/nm0680846/?ref_=adv_li_dr_0">Todd Phillips</a>
## [9] <a href="/name/nm0426059/?ref_=adv_li_dr_0">Rian Johnson</a>
## [10] <a href="/name/nm0003506/?ref_=adv_li_dr_0">James Mangold</a>
## [11] <a href="/name/nm1093039/?ref_=adv_li_dr_0">Mike Flanagan</a>
## [12] <a href="/name/nm1047532/?ref_=adv_li_dr_0">Brian Kirk</a>
## [13] <a href="/name/nm3211470/?ref_=adv_li_dr_0">Robert Eggers</a>
## [14] <a href="/name/nm0000217/?ref_=adv_li_dr_0">Martin Scorsese</a>
## [15] <a href="/name/nm0269463/?ref_=adv_li_dr_0">Jon Favreau</a>
## [16] <a href="/name/nm0000386/?ref_=adv_li_dr_0">Roland Emmerich</a>
## [17] <a href="/name/nm0000876/?ref_=adv_li_dr_0">Noah Baumbach</a>
## [18] <a href="/name/nm1783265/?ref_=adv_li_dr_0">Tim Miller</a>
## [19] <a href="/name/nm1716636/?ref_=adv_li_dr_0">Marielle Heller</a>
## [20] <a href="/name/nm0009190/?ref_=adv_li_dr_0">J.J. Abrams</a>
## ...
# Extract the text of each director node
directors_data <- directors_data_html %>% html_text()
# Show all director names
directors_data
## [1] "Benny Safdie" "Sam Mendes" "Bong Joon Ho"
## [4] "Taika Waititi" "Guy Ritchie" "Quentin Tarantino"
## [7] "Greta Gerwig" "Todd Phillips" "Rian Johnson"
## [10] "James Mangold" "Mike Flanagan" "Brian Kirk"
## [13] "Robert Eggers" "Martin Scorsese" "Jon Favreau"
## [16] "Roland Emmerich" "Noah Baumbach" "Tim Miller"
## [19] "Marielle Heller" "J.J. Abrams" "Robert Rodriguez"
## [22] "Ari Aster" "Jay Roach" "John Turturro"
## [25] "Lorene Scafaria" "Josh Cooley" "Jake Kasdan"
## [28] "Anthony Russo" "Rupert Goold" "Severin Fiala"
## [31] "Alma Har'el" "Michael Bay" "James Gray"
## [34] "Kasi Lemmons" "Ruben Fleischer" "Tom Hooper"
## [37] "Destin Daniel Cretton" "Stephen Merchant" "Clint Eastwood"
## [40] "Fernando Meirelles" "Chris Buck" "Edward Norton"
## [43] "Ant Timpson" "Guy Ritchie" "David Leitch"
## [46] "Ang Lee" "Todd Haynes" "Richard Stanley"
## [49] "Joachim Rønning" "Olivia Wilde" "Andy Muschietti"
## [52] "David F. Sandberg" "Todd Robinson" "Paul Feig"
## [55] "Chad Stahelski" "Trey Edward Shults" "Kevin Smith"
## [58] "Pedro Almodóvar" "Andy Fickman" "Jon Watts"
## [61] "Melina Matsoukas" "Céline Sciamma" "Bill Condon"
## [64] "Matt Bettinelli-Olpin" "David Michôd" "Michael Engler"
## [67] "Bert" "Adrian Grunberg" "Justin Dec"
## [70] "Armando Iannucci" "Dexter Fletcher" "Terrence Malick"
## [73] "Mark Jenkin" "Jordan Peele" "Tyler Nilson"
## [76] "Jenny Gage" "Jonathan Levine" "Sergio Pablos"
## [79] "Michael Dougherty" "Anna Boden" "Justin Kurzel"
## [82] "Greg Tiernan" "Jan Komasa" "Agnieszka Holland"
## [85] "Jeff Chan" "Luc Besson" "F. Gary Gray"
## [88] "Ric Roman Waugh" "Danny Boyle" "Ericson Core"
## [91] "Gene Stupnitsky" "Lulu Wang" "Jason Lei Howden"
## [94] "Elizabeth Banks" "Alejandro Landes" "Neil Marshall"
## [97] "Wilson Yip" "Nick Bruno" "Joe Berlinger"
## [100] "Lorcan Finnegan"
# Select the link that directly follows the `.ghost` separator inside each
# `.lister-item-content` block; the printed nodes below are actor links
actors_data_html <- webpage %>% html_nodes(".lister-item-content .ghost+ a")
actors_data_html
## {xml_nodeset (100)}
## [1] <a href="/name/nm11243726/?ref_=adv_li_st_0">Mesfin Lamengo</a>
## [2] <a href="/name/nm2835616/?ref_=adv_li_st_0">Dean-Charles Chapman</a>
## [3] <a href="/name/nm0814280/?ref_=adv_li_st_0">Kang-ho Song</a>
## [4] <a href="/name/nm9877392/?ref_=adv_li_st_0">Roman Griffin Davis</a>
## [5] <a href="/name/nm0000190/?ref_=adv_li_st_0">Matthew McConaughey</a>
## [6] <a href="/name/nm0000138/?ref_=adv_li_st_0">Leonardo DiCaprio</a>
## [7] <a href="/name/nm1519680/?ref_=adv_li_st_0">Saoirse Ronan</a>
## [8] <a href="/name/nm0001618/?ref_=adv_li_st_0">Joaquin Phoenix</a>
## [9] <a href="/name/nm0185819/?ref_=adv_li_st_0">Daniel Craig</a>
## [10] <a href="/name/nm0000354/?ref_=adv_li_st_0">Matt Damon</a>
## [11] <a href="/name/nm0000191/?ref_=adv_li_st_0">Ewan McGregor</a>
## [12] <a href="/name/nm1569276/?ref_=adv_li_st_0">Chadwick Boseman</a>
## [13] <a href="/name/nm1500155/?ref_=adv_li_st_0">Robert Pattinson</a>
## [14] <a href="/name/nm0000134/?ref_=adv_li_st_0">Robert De Niro</a>
## [15] <a href="/name/nm2255973/?ref_=adv_li_st_0">Donald Glover</a>
## [16] <a href="/name/nm4534098/?ref_=adv_li_st_0">Ed Skrein</a>
## [17] <a href="/name/nm3485845/?ref_=adv_li_st_0">Adam Driver</a>
## [18] <a href="/name/nm0000157/?ref_=adv_li_st_0">Linda Hamilton</a>
## [19] <a href="/name/nm0000158/?ref_=adv_li_st_0">Tom Hanks</a>
## [20] <a href="/name/nm0000402/?ref_=adv_li_st_0">Carrie Fisher</a>
## ...
# Extract the text of each actor node
actors_data <- actors_data_html %>% html_text()
# Preview the first few actor names
actors_data %>% head()
## [1] "Mesfin Lamengo" "Dean-Charles Chapman" "Kang-ho Song"
## [4] "Roman Griffin Davis" "Matthew McConaughey" "Leonardo DiCaprio"
Be careful with missing data.
# Select the metascore nodes with the CSS selector `.metascore`
metascore_data_html <- webpage %>% html_nodes(".metascore")
# Extract the text of each metascore node
metascore_data <- metascore_data_html %>% html_text()
# Preview the first few raw metascores
metascore_data %>% head()
## [1] "90 " "78 " "96 " "58 " "51 "
## [6] "83 "
# Data preprocessing: strip trailing whitespace, then convert to numeric
metascore_data <- metascore_data %>%
  str_replace("\\s*$", "") %>%
  as.numeric()
metascore_data
## [1] 90 78 96 58 51 83 91 59 82 81 59 51 83 94 55 47 93 54 80 53 53 72 64 79 84
## [26] 58 78 66 63 73 41 80 66 55 32 68 68 68 75 64 60 65 53 60 38 73 70 43 84 58
## [51] 71 52 50 73 80 46 87 24 69 74 95 55 64 62 64 59 26 31 75 69 78 84 81 70 30
## [76] 67 65 48 64 84 46 69 62 40 38 45 55 71 60 89 43 52 78 31 62 54 52
# Let's check the length of the metascore data: a value below the number of
# movies scraped means some movies have no metascore node on the page
length(metascore_data)
## [1] 97
# Query `.metascore` once per movie instead of hard-coding, from visual
# inspection, which ranks lack a score: html_node() yields exactly one result
# per `.lister-item-content` entry and html_text() turns a non-match into NA,
# so the positions of the missing scores no longer need to be maintained by
# hand when the page changes.
ms <- webpage %>%
  html_nodes(".lister-item-content") %>%
  html_node(".metascore") %>%
  html_text() %>%
  str_trim() %>%
  as.numeric()
(metascore_data <- ms)
## [1] 90 78 96 58 51 83 91 59 82 81 59 51 83 94 55 47 93 54 80 53 53 72 64 NA 79
## [26] 84 58 78 66 63 73 41 80 66 55 32 68 68 68 75 64 60 65 53 60 38 73 70 43 84
## [51] 58 71 52 50 73 80 46 87 24 69 74 95 55 64 62 64 59 26 31 75 69 78 84 81 70
## [76] 30 67 65 48 64 84 46 69 62 NA 40 38 45 55 71 60 89 43 52 78 31 62 54 52 NA
Be careful with missing data.
# Select the gross-revenue spans with `.ghost~ .text-muted+ span`
gross_data_html <- webpage %>% html_nodes(".ghost~ .text-muted+ span")
# Extract the text of each gross-revenue node
gross_data <- gross_data_html %>% html_text()
# Preview the first few raw gross figures
gross_data %>% head()
## [1] "$0.35M" "$135.37M" "$192.73M" "$0.43M" "$540.08M" "$85.71M"
# Data preprocessing: drop the "M" suffix, skip the leading "$" by keeping
# characters 2 to 10, then convert to numeric
# (alternative: str_extract(gross_data, "[:digit:]+.[:digit:]+"))
gross_data <- gross_data %>%
  str_replace("M", "") %>%
  str_sub(2, 10) %>%
  as.numeric()
# Number of movies that report a gross figure
length(gross_data)
## [1] 40
# Visual inspection finds below movies don't have gross
#gs_data <- rep(NA, 100)
#gs_data[-c(1, 2, 3, 5, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
#(gross_data <- gs_data)
60 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.
# Scrape the rank nodes and the gross nodes with a single selector so that
# both kinds of entry end up in one character vector
rank_and_gross <- html_text(
  html_nodes(webpage, '.ghost~ .text-muted+ span , .text-primary')
)
# Remove the first run of whitespace in each entry ...
rank_and_gross <- str_replace(rank_and_gross, "\\s+", "")
# ... and every "$" and "M" character
rank_and_gross <- str_replace_all(rank_and_gross, "[$M]", "")
rank_and_gross
## [1] "1." "2." "3." "4." "0.35" "5." "6." "135.37"
## [9] "7." "8." "192.73" "9." "10." "11." "12." "13."
## [17] "0.43" "14." "15." "540.08" "16." "17." "18." "19."
## [25] "20." "21." "85.71" "22." "27.33" "23." "24." "25."
## [33] "80.55" "26." "433.03" "27." "28." "858.37" "29." "30."
## [41] "31." "32." "33." "35.40" "34." "35." "26.80" "36."
## [49] "37." "38." "22.96" "39." "40." "41." "42." "43."
## [57] "44." "354.87" "45." "165.55" "46." "20.55" "47." "48."
## [65] "49." "36.95" "50." "22.68" "51." "193.77" "52." "140.37"
## [73] "53." "54." "55." "171.02" "56." "57." "58." "59."
## [81] "60." "388.53" "61." "62." "63." "64." "26.74" "65."
## [89] "66." "31.03" "67." "68." "18.87" "69." "70." "71."
## [97] "96.37" "72." "73." "74." "175.01" "75." "13.12" "76."
## [105] "12.14" "77." "30.32" "78." "79." "110.50" "80." "426.83"
## [113] "81." "82." "30.30" "83." "84." "85." "86." "7.74"
## [121] "87." "79.80" "88." "67.16" "89." "73.29" "90." "91."
## [129] "69.06" "92." "16.88" "93." "94." "95." "96." "21.90"
## [137] "97." "98." "99." "100."
# Locate the movies without gross data in the interleaved rank/gross vector.
# Rank entries end with "."; a rank that is immediately followed by another
# rank, or that sits in the last position, has no gross value after it.
isrank <- str_detect(rank_and_gross, "\\.$")
# Compare every entry with its successor; padding the shifted vector with TRUE
# makes the end of the vector count as "followed by a rank".
ismissing <- isrank & c(isrank[-1], TRUE)
missingpos <- as.integer(rank_and_gross[ismissing])
# One slot per scraped rank, NA by default. The positions to fill are computed
# with setdiff() rather than a negative index: x[-integer(0)] selects nothing,
# so a negative index would leave every slot NA when no movie is missing.
gs_data <- rep(NA_real_, sum(isrank))
gs_data[setdiff(seq_along(gs_data), missingpos)] <- gross_data
(gross_data <- gs_data)
## [1] NA NA NA 0.35 NA 135.37 NA 192.73 NA NA
## [11] NA NA 0.43 NA 540.08 NA NA NA NA NA
## [21] 85.71 27.33 NA NA 80.55 433.03 NA 858.37 NA NA
## [31] NA NA 35.40 NA 26.80 NA NA 22.96 NA NA
## [41] NA NA NA 354.87 165.55 20.55 NA NA 36.95 22.68
## [51] 193.77 140.37 NA NA 171.02 NA NA NA NA 388.53
## [61] NA NA NA 26.74 NA 31.03 NA 18.87 NA NA
## [71] 96.37 NA NA 175.01 13.12 12.14 30.32 NA 110.50 426.83
## [81] NA 30.30 NA NA NA 7.74 79.80 67.16 73.29 NA
## [91] 69.06 16.88 NA NA NA 21.90 NA NA NA NA
The following code programmatically figures out the missing entries for metascore.
# Use CSS selectors to scrape the rank nodes and the metascore nodes together
rank_metascore_data_html <- webpage %>%
  html_nodes(".unfavorable , .favorable , .mixed , .text-primary")
rank_metascore_data_html
## {xml_nodeset (197)}
## [1] <span class="lister-item-index unbold text-primary">1.</span>
## [2] <span class="metascore favorable">90 </span>
## [3] <span class="lister-item-index unbold text-primary">2.</span>
## [4] <span class="metascore favorable">78 </span>
## [5] <span class="lister-item-index unbold text-primary">3.</span>
## [6] <span class="metascore favorable">96 </span>
## [7] <span class="lister-item-index unbold text-primary">4.</span>
## [8] <span class="metascore mixed">58 </span>
## [9] <span class="lister-item-index unbold text-primary">5.</span>
## [10] <span class="metascore mixed">51 </span>
## [11] <span class="lister-item-index unbold text-primary">6.</span>
## [12] <span class="metascore favorable">83 </span>
## [13] <span class="lister-item-index unbold text-primary">7.</span>
## [14] <span class="metascore favorable">91 </span>
## [15] <span class="lister-item-index unbold text-primary">8.</span>
## [16] <span class="metascore mixed">59 </span>
## [17] <span class="lister-item-index unbold text-primary">9.</span>
## [18] <span class="metascore favorable">82 </span>
## [19] <span class="lister-item-index unbold text-primary">10.</span>
## [20] <span class="metascore favorable">81 </span>
## ...
# Convert the rank and metascore nodes to text
rank_metascore_data <- rank_metascore_data_html %>% html_text()
rank_metascore_data
## [1] "1." "90 " "2." "78 " "3."
## [6] "96 " "4." "58 " "5." "51 "
## [11] "6." "83 " "7." "91 " "8."
## [16] "59 " "9." "82 " "10." "81 "
## [21] "11." "59 " "12." "51 " "13."
## [26] "83 " "14." "94 " "15." "55 "
## [31] "16." "47 " "17." "93 " "18."
## [36] "54 " "19." "80 " "20." "53 "
## [41] "21." "53 " "22." "72 " "23."
## [46] "64 " "24." "25." "79 " "26."
## [51] "84 " "27." "58 " "28." "78 "
## [56] "29." "66 " "30." "63 " "31."
## [61] "73 " "32." "41 " "33." "80 "
## [66] "34." "66 " "35." "55 " "36."
## [71] "32 " "37." "68 " "38." "68 "
## [76] "39." "68 " "40." "75 " "41."
## [81] "64 " "42." "60 " "43." "65 "
## [86] "44." "53 " "45." "60 " "46."
## [91] "38 " "47." "73 " "48." "70 "
## [96] "49." "43 " "50." "84 " "51."
## [101] "58 " "52." "71 " "53." "52 "
## [106] "54." "50 " "55." "73 " "56."
## [111] "80 " "57." "46 " "58." "87 "
## [116] "59." "24 " "60." "69 " "61."
## [121] "74 " "62." "95 " "63." "55 "
## [126] "64." "64 " "65." "62 " "66."
## [131] "64 " "67." "59 " "68." "26 "
## [136] "69." "31 " "70." "75 " "71."
## [141] "69 " "72." "78 " "73." "84 "
## [146] "74." "81 " "75." "70 " "76."
## [151] "30 " "77." "67 " "78." "65 "
## [156] "79." "48 " "80." "64 " "81."
## [161] "84 " "82." "46 " "83." "69 "
## [166] "84." "62 " "85." "86." "40 "
## [171] "87." "38 " "88." "45 " "89."
## [176] "55 " "90." "71 " "91." "60 "
## [181] "92." "89 " "93." "43 " "94."
## [186] "52 " "95." "78 " "96." "31 "
## [191] "97." "62 " "98." "54 " "99."
## [196] "52 " "100."
# Strip the first run of whitespace from each entry (the space after a score)
rank_metascore_data <- rank_metascore_data %>% str_replace("\\s+", "")
rank_metascore_data
## [1] "1." "90" "2." "78" "3." "96" "4." "58" "5." "51"
## [11] "6." "83" "7." "91" "8." "59" "9." "82" "10." "81"
## [21] "11." "59" "12." "51" "13." "83" "14." "94" "15." "55"
## [31] "16." "47" "17." "93" "18." "54" "19." "80" "20." "53"
## [41] "21." "53" "22." "72" "23." "64" "24." "25." "79" "26."
## [51] "84" "27." "58" "28." "78" "29." "66" "30." "63" "31."
## [61] "73" "32." "41" "33." "80" "34." "66" "35." "55" "36."
## [71] "32" "37." "68" "38." "68" "39." "68" "40." "75" "41."
## [81] "64" "42." "60" "43." "65" "44." "53" "45." "60" "46."
## [91] "38" "47." "73" "48." "70" "49." "43" "50." "84" "51."
## [101] "58" "52." "71" "53." "52" "54." "50" "55." "73" "56."
## [111] "80" "57." "46" "58." "87" "59." "24" "60." "69" "61."
## [121] "74" "62." "95" "63." "55" "64." "64" "65." "62" "66."
## [131] "64" "67." "59" "68." "26" "69." "31" "70." "75" "71."
## [141] "69" "72." "78" "73." "84" "74." "81" "75." "70" "76."
## [151] "30" "77." "67" "78." "65" "79." "48" "80." "64" "81."
## [161] "84" "82." "46" "83." "69" "84." "62" "85." "86." "40"
## [171] "87." "38" "88." "45" "89." "55" "90." "71" "91." "60"
## [181] "92." "89" "93." "43" "94." "52" "95." "78" "96." "31"
## [191] "97." "62" "98." "54" "99." "52" "100."
# Rank entries end with "."; a rank followed by another rank means that the
# metascore for the first of the two ranks is missing
isrank <- rank_metascore_data %>% str_detect("\\.$")
isrank
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [13] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [25] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [37] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE
## [49] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [61] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [73] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [85] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [97] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [109] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [121] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [133] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [145] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [157] FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [169] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [181] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
## [193] TRUE FALSE TRUE FALSE TRUE
# Flag every rank that is not followed by a metascore: a rank whose successor
# is also a rank, or a rank in the last position (the TRUE padding).
# The shifted comparison avoids spelling the index as 1:n-1, which R parses as
# (1:n) - 1 because `:` binds tighter than `-`.
ismissing <- isrank & c(isrank[-1], TRUE)
# The flagged entries are the rank labels themselves, e.g. "24." -> 24
(missingpos <- as.integer(rank_metascore_data[ismissing]))
## [1] 24 85 100
#(rank_metascore_data <- as.integer(rank_metascore_data))
You (students) should work out the code for finding missing positions for gross.
Form a tibble:
# Combine the scraped vectors into a tibble, one column per field
movies <- tibble(
  Rank = rank_data,
  Title = title_data,
  Description = description_data,
  Runtime = runtime_data,
  Genre = genre_data,
  Rating = rating_data,
  Metascore = metascore_data,
  Votes = votes_data,
  Gross_Earning_in_Mil = gross_data,
  Director = directors_data,
  Actor = actors_data
)
# Print without truncating the columns to the console width
print(movies, width = Inf)
## # A tibble: 100 x 11
## Rank Title
## <int> <chr>
## 1 1 Uncut Gems
## 2 2 1917
## 3 3 Parasite
## 4 4 Jojo Rabbit
## 5 5 The Gentlemen
## 6 6 Once Upon a Time... in Hollywood
## 7 7 Little Women
## 8 8 Joker
## 9 9 Knives Out
## 10 10 Ford v Ferrari
## Description
## <chr>
## 1 A charismatic New York City jeweler always on the lookout for the next big s…
## 2 April 6th, 1917. As a regiment assembles to wage war deep in enemy territory…
## 3 A poor family, the Kims, con their way into becoming the servants of a rich …
## 4 A young boy in Hitler's army finds out his mother is hiding a Jewish girl in…
## 5 An American expat tries to sell off his highly profitable marijuana empire i…
## 6 A faded television actor and his stunt double strive to achieve fame and suc…
## 7 Jo March reflects back and forth on her life, telling the beloved story of t…
## 8 In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and m…
## 9 A detective investigates the death of a patriarch of an eccentric, combative…
## 10 American car designer Carroll Shelby and driver Ken Miles battle corporate i…
## Runtime Genre Rating Metascore Votes Gross_Earning_in_Mil Director
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 135 Crime 7.7 90 83566 NA Benny Safdie
## 2 119 Drama 8.5 78 166010 NA Sam Mendes
## 3 132 Comedy 8.6 96 216718 NA Bong Joon Ho
## 4 108 Comedy 8 58 104999 0.35 Taika Waititi
## 5 113 Action 8.1 51 28099 NA Guy Ritchie
## 6 161 Comedy 7.7 83 376301 135. Quentin Tarantino
## 7 135 Drama 8.1 91 54597 NA Greta Gerwig
## 8 122 Crime 8.6 59 666811 193. Todd Phillips
## 9 131 Comedy 8 82 157627 NA Rian Johnson
## 10 152 Action 8.2 81 130613 NA James Mangold
## Actor
## <chr>
## 1 Mesfin Lamengo
## 2 Dean-Charles Chapman
## 3 Kang-ho Song
## 4 Roman Griffin Davis
## 5 Matthew McConaughey
## 6 Leonardo DiCaprio
## 7 Saoirse Ronan
## 8 Joaquin Phoenix
## 9 Daniel Craig
## 10 Matt Damon
## # … with 90 more rows
How many top 100 movies are in each genre? (Be careful with interpretation.)
# Bar chart of the number of movies in each genre
ggplot(data = movies, mapping = aes(x = Genre)) +
  geom_bar()
Which genre is most profitable in terms of average gross earnings?
# Average gross earning per genre (NA gross values are ignored), drawn as one
# column per genre
movies %>%
  group_by(Genre) %>%
  summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = Genre, y = avg_earning)) +
  geom_col() +
  labs(y = "avg earning in millions")
## Warning: Removed 1 rows containing missing values (position_stack).
# Distribution of gross earning within each genre
movies %>%
  ggplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
  geom_boxplot() +
  labs(y = "Gross earning in millions")
## Warning: Removed 60 rows containing non-finite values (stat_boxplot).
Is there a relationship between gross earning and rating? Find the best-selling movie (by gross earning) in each genre.
library("ggrepel")
# Within each genre keep the single row ranked first by descending gross
# earning; the result stays grouped by Genre
best_in_genre <- filter(
  group_by(movies, Genre),
  row_number(desc(Gross_Earning_in_Mil)) == 1
)
best_in_genre
## # A tibble: 8 x 11
## # Groups: Genre [8]
## Rank Title Description Runtime Genre Rating Metascore Votes Gross_Earning_i…
## <int> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 6 Once… A faded te… 161 Come… 7.7 83 376301 135.
## 2 8 Joker In Gotham … 122 Crime 8.6 59 666811 193.
## 3 15 The … After the … 118 Anim… 6.9 55 163671 540.
## 4 28 Aven… After the … 181 Acti… 8.5 78 663960 858.
## 5 44 Alad… A kind-hea… 128 Adve… 7 53 184442 355.
## 6 51 It C… Twenty-sev… 169 Drama 6.6 58 155862 194.
## 7 71 Rock… A musical … 121 Biog… 7.4 69 94514 96.4
## 8 74 Us A family's… 116 Horr… 6.9 81 181696 175.
## # … with 2 more variables: Director <chr>, Actor <chr>
# Gross earning against rating: point size encodes votes, colour encodes
# genre, and the best-selling movie of each genre is labelled with its title
ggplot(data = movies, mapping = aes(x = Rating, y = Gross_Earning_in_Mil)) +
  geom_point(aes(size = Votes, color = Genre)) +
  ggrepel::geom_label_repel(data = best_in_genre, mapping = aes(label = Title)) +
  labs(y = "Gross earning in millions")
## Warning: Removed 60 rows containing missing values (geom_point).
The complete list of search operators is described at http://www.googleguide.com/advanced_operators_reference.html.
searchTerm <- "ucla"
# tbm=isch (images), app (apps), bks (books), nws (news), pts (patents), vid (videos)
# tbs=isz:m (medium images)
# <https://stenevang.wordpress.com/2013/02/22/google-advanced-power-search-url-request-parameters/>
# Build the image-search URL. The term is percent-encoded so that spaces, "&",
# "#" and similar characters in a search term cannot break the query string.
(url <- paste0("https://www.google.com/search?q=",
               utils::URLencode(searchTerm, reserved = TRUE),
               "&source=lnms&tbm=isch&sa=X&tbs=isz:m"))
## [1] "https://www.google.com/search?q=ucla&source=lnms&tbm=isch&sa=X&tbs=isz:m"
# Download and parse the search result page
webpage <- read_html(url)
# Collect the src attribute of every <img> node; some entries come back as NA
imageurl <- html_attr(html_nodes(webpage, "img"), "src")
imageurl
## [1] "/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif"
## [2] NA
## [3] NA
## [4] NA
## [5] "http://t1.gstatic.com/images?q=tbn:ANd9GcRi4lynSHbXZ4Iw8g2dqSWIHUbwYlVAnCG8JmoJk0m5TDqv7u1A4DZXIXo"
## [6] "http://t2.gstatic.com/images?q=tbn:ANd9GcR6WcG06o-hL1TdUt9koGmqZ3rb6gKXdH-qzeJzFxfxlPbwyl01vgcnmT5A"
## [7] "http://t3.gstatic.com/images?q=tbn:ANd9GcSb0UvxFeU1jMmffJp7HqtlvJWO8YCuTDtLrL4WEXBZyjRjG_ZwJhxknho"
## [8] "http://t0.gstatic.com/images?q=tbn:ANd9GcQky72Pltll9gse5zuuUB5v3usGPHsUWIQHkUxOBcCkaX8FQqGZKIrfPDHO"
## [9] "http://t1.gstatic.com/images?q=tbn:ANd9GcSL2OVtYLGq8WkODsaQvQdhx9L4Bo77jiyw85zxiI2CIfY9QUbXd0MifA"
## [10] "http://t2.gstatic.com/images?q=tbn:ANd9GcQgDqKCP2Uj7dmIqDKLCzwnO2Nxe4NKkxqi7yzULvq5yIvx9AM5Let8VSfF"
## [11] "http://t3.gstatic.com/images?q=tbn:ANd9GcQY6joNaBY8I1OslMcbnT6jTExmhBUJoUYQXLGdFzVTveOzs25XO6WNlr0"
## [12] "http://t0.gstatic.com/images?q=tbn:ANd9GcRlohKXRrTHPQ2qUYe_QbQ_4xL7INWBdJxqcJO6JUe6z1qYt1YuJ9kxW6rH"
## [13] "http://t1.gstatic.com/images?q=tbn:ANd9GcTACCo1ZrcHrK8aemBX40uKkyecusTqd2XNBq-8Pb_iadh1RK0BnA3UsEVK"
## [14] "http://t2.gstatic.com/images?q=tbn:ANd9GcQd5o6YyCN6E9wW0_htp0X5E1huKFsX-O0zpl_J0xuon6DaaYicZClCVTPV"
## [15] "http://t3.gstatic.com/images?q=tbn:ANd9GcSIwWQ2gn7Ae8lSfeTuK5QGLPv4v6O1uxD_RrVuDElsHEXOZQiPvIyEcWY"
## [16] "http://t0.gstatic.com/images?q=tbn:ANd9GcTiL2-GBIJzMc0SVtc6lr5-jPgk-UiJB67YlMCRzwFOcJUxLg5X3WPdCtFs"
## [17] "http://t1.gstatic.com/images?q=tbn:ANd9GcTbYVc_m_Ny1g4TF3ZcjIlyeD5QmCRS2yVXbk-M9_mDEwCDWZcHbgnNEIw"
## [18] "http://t2.gstatic.com/images?q=tbn:ANd9GcRAL3ohwQw-hYoXz2wMITlAA9sOu0IEFURoW2ofFX0lFVHxOgXmNkqnjsLE"
## [19] "http://t3.gstatic.com/images?q=tbn:ANd9GcSy9BXU7jML2lyCXeOwLp9SKejkm1-Va1BSkYjpVtu9UbexM7N2XFUwvxY"
## [20] "http://t0.gstatic.com/images?q=tbn:ANd9GcTrlSU-lqvX0HxLSrEjgN0qKChaFkwTxom3KIhuurBh3RRJchYq9An_MuUS"
## [21] "http://t1.gstatic.com/images?q=tbn:ANd9GcQi3kIBDeclmkBAQCFcaEhb4gymQLbZFtrfRNzQvgtAjfOAZ4FlLb39UQo"
## [22] "http://t2.gstatic.com/images?q=tbn:ANd9GcTqhIjDcVUTotgA5pMeGH6U_jbJBrOd3WSklyTQPwNoB-crknsFUt9eXsQo"
## [23] "http://t3.gstatic.com/images?q=tbn:ANd9GcTbOfKoRFdGCFacUJZsQ01W-iwwVVUh4y67vFkE4Jh-3IT9g75T26b0WidqAg"
## [24] "http://t0.gstatic.com/images?q=tbn:ANd9GcSsfJWAolCg59JerAWwBCmuwAIfANga0QgBV9k7o5Ru1BmLk-EyouqRttPD"
## [25] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_g_color_18x26dp.gif"
## [26] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_red_o_color_16x26dp.gif"
## [27] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [28] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [29] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [30] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [31] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [32] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [33] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [34] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [35] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_orange_o_color_16x26dp.gif"
## [36] "/images/branding/searchlogo/1x/googlelogo_tier_3_footer_gle_with_chevron_color_100x26dp.gif"
## [37] NA
## [38] NA
## [39] NA
Following code still not working…
# Download every usable image URL in `files` into the directory `outPath`.
#
# files   : character vector of image URLs. NA entries and entries without a
#           URL scheme (e.g. site-relative paths such as "/images/logo.gif")
#           cannot be fetched by download.file() and are skipped.
# brand   : prefix used for the saved file names.
# outPath : destination directory; it is created when it does not exist.
#
# Each image is saved as "<outPath>/<brand>_<i>.jpg", where i is the position
# of the URL in `files`. Called for its side effect; returns NULL invisibly.
downloadImages <- function(files, brand, outPath = "images") {
  dir.create(outPath, showWarnings = FALSE, recursive = TRUE)
  # grepl() returns FALSE for NA, so missing src attributes are dropped here too
  usable <- which(grepl("^(https?|ftp|file)://", files))
  skipped <- length(files) - length(usable)
  if (skipped > 0) {
    message("Skipping ", skipped, " NA or relative URL(s)")
  }
  # Iterating over `usable` is also safe for empty input, unlike 1:length(files)
  for (i in usable) {
    download.file(files[i],
                  destfile = file.path(outPath, paste0(brand, "_", i, ".jpg")),
                  mode = "wb")
  }
  invisible(NULL)
}
# Try to download all scraped image URLs, using "ucla" as the file-name prefix
# and the default output directory "images"
downloadImages(imageurl, "ucla")
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, : URL
## '/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif':
## status was 'URL using bad/illegal format or missing URL'
## Error in download.file(files[i], destfile = paste0(outPath, "/", brand, : cannot open URL '/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif'
ls images/
The quantmod package contains many utility functions for retrieving and plotting financial data. E.g.,
library(quantmod)
# Fetch GOOG price data from Yahoo starting on 2010-02-11; auto.assign = FALSE
# makes getSymbols() return the series so it can be stored in `stock`
stock <- getSymbols(
  "GOOG",
  src = "yahoo",
  from = "2010-02-11",
  auto.assign = FALSE
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## Warning: 'indexClass<-' is deprecated.
## Use 'tclass<-' instead.
## See help("Deprecated") and help("xts-deprecated").
# Show the first rows of the downloaded price series
head(stock)
## GOOG.Open GOOG.High GOOG.Low GOOG.Close GOOG.Volume GOOG.Adjusted
## 2010-02-11 265.6642 269.2358 263.7614 267.1985 4838000 267.1985
## 2010-02-12 265.4899 267.5721 264.2595 265.5646 4576400 265.5646
## 2010-02-16 267.4326 271.0490 266.1524 269.6393 7336100 269.6393
## 2010-02-17 269.9880 270.6854 267.8012 268.1001 4074500 268.1001
## 2010-02-18 267.7664 271.4874 267.0690 270.5957 4691200 270.5957
## 2010-02-19 269.2558 270.9992 268.8423 269.3703 5125300 269.3703
# Plot the series as a line chart with the white theme on a linear price scale
# NOTE(review): TA = NULL presumably suppresses the default indicator panels;
# confirm in ?chartSeries
chartSeries(
  stock,
  type = "line",
  theme = chartTheme("white"),
  log.scale = FALSE,
  TA = NULL
)
## Warning: 'indexClass<-' is deprecated.
## Use 'tclass<-' instead.
## See help("Deprecated") and help("xts-deprecated").
Read blog: https://towardsdatascience.com/pulling-tweets-into-r-e17d4981cfe2
The twitteR package is useful for pulling tweet text data into R.
library(twitteR) #load package
Step 1: apply for a Twitter developer account. It takes some time to get approved.
# Twitter API credentials. The values below are placeholders: substitute the
# keys issued with your own developer account, and avoid committing real keys.
consumer_key <- 'XXXXXXXXXX'
consumer_secret <- 'XXXXXXXXXX'
access_token <- 'XXXXXXXXXX'
access_secret <- 'XXXXXXXXXX'
# Authenticate this R session against the Twitter API with the four credentials
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
## [1] "Using direct authentication"
# Request up to 1000 tweets matching the query, posted since 2020-01-01
virus <- searchTwitter(
  '#China + #Coronavirus',
  n = 1000,
  since = '2020-01-01',
  retryOnRateLimit = 1e3
)
# Flatten the list of tweets into a tibble and print it with all columns shown
virus_df <- virus %>%
  twListToDF() %>%
  as_tibble()
print(virus_df, width = Inf)
## # A tibble: 1,000 x 16
## text
## <chr>
## 1 "Coronavirus in New York: What We Know So Far SEE DETAILS AT ==> https://…
## 2 "RT @PaolaSCruz1990: @DeZurdaTeam @DiazCanelB @marti160patria @SecUJCuba @Fe…
## 3 "RT @HappeningNow__: I'm in shock 😱😱😱😭😭😭😭\n\n and very sad watching thi…
## 4 "RT @CirculoGloBal_I: Los Cuervos invaden #XiNing #China #Xining #Coronavir…
## 5 "A Chinese drugmaker said it has started mass-producing an experimental drug…
## 6 "#Coronavirus ya ha cobrado mil 112 vidas y lleva más de 44 mil infectados e…
## 7 "RT @WarsontheBrink: Shocking Revelations\n\n#Chinese Billionaire \"Guo Weng…
## 8 "RT @StephenMcDonell: New official #China #coronavirus figures: definitely i…
## 9 "RT @BeholdIsrael: Israelis & Chinese gathered at Tel Aviv town hall squ…
## 10 "RT @WarsontheBrink: VIDEO: #China\n\n🚨GRAPHIC:🚨\n\nALLEGED #Coronavirus pat…
## favorited favoriteCount replyToSN created truncated replyToSID
## <lgl> <dbl> <chr> <dttm> <lgl> <chr>
## 1 FALSE 0 <NA> 2020-02-12 03:29:06 TRUE <NA>
## 2 FALSE 0 <NA> 2020-02-12 03:28:54 FALSE <NA>
## 3 FALSE 0 <NA> 2020-02-12 03:28:50 FALSE <NA>
## 4 FALSE 0 <NA> 2020-02-12 03:28:50 FALSE <NA>
## 5 FALSE 0 <NA> 2020-02-12 03:28:45 TRUE <NA>
## 6 FALSE 0 <NA> 2020-02-12 03:28:45 FALSE <NA>
## 7 FALSE 0 <NA> 2020-02-12 03:28:41 FALSE <NA>
## 8 FALSE 0 <NA> 2020-02-12 03:28:35 FALSE <NA>
## 9 FALSE 0 <NA> 2020-02-12 03:28:34 FALSE <NA>
## 10 FALSE 0 <NA> 2020-02-12 03:28:22 FALSE <NA>
## id replyToUID
## <chr> <chr>
## 1 1227434427497828354 <NA>
## 2 1227434375522004993 <NA>
## 3 1227434360867106817 <NA>
## 4 1227434358656708609 <NA>
## 5 1227434338410729474 <NA>
## 6 1227434337181818880 <NA>
## 7 1227434321461620736 <NA>
## 8 1227434295540752385 <NA>
## 9 1227434291723952133 <NA>
## 10 1227434240213733376 <NA>
## statusSource
## <chr>
## 1 "<a href=\"http://www.akidthaine.com\" rel=\"nofollow\">Clickclickme</a>"
## 2 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
## 3 "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>"
## 4 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
## 5 "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for…
## 6 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
## 7 "<a href=\"http://sinproject.net/tweecha/\" rel=\"nofollow\">tweechaPrime</a…
## 8 "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>"
## 9 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
## 10 "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter fo…
## screenName retweetCount isRetweet retweeted longitude latitude
## <chr> <dbl> <lgl> <lgl> <lgl> <lgl>
## 1 bitcoinconnect 0 FALSE FALSE NA NA
## 2 MonicaR45472290 24 TRUE FALSE NA NA
## 3 ka51087963 111 TRUE FALSE NA NA
## 4 ikerpaul 30 TRUE FALSE NA NA
## 5 Apex_WW 0 FALSE FALSE NA NA
## 6 KarlitaTmUs 0 FALSE FALSE NA NA
## 7 sharmabrr 536 TRUE FALSE NA NA
## 8 fezziwig2019 40 TRUE FALSE NA NA
## 9 DanielWhorton 65 TRUE FALSE NA NA
## 10 jkcracker21 79 TRUE FALSE NA NA
## # … with 990 more rows
See HW3.