Step 2
Web scraping to create a data frame with the columns we need: Title, Year, Duration, Rating, Votes
knitr::opts_chunk$set(warning = FALSE)
library(rvest)
Warning: package 'rvest' was built under R version 4.3.2
Warning: package 'dplyr' was built under R version 4.3.2
library(stringr)
library(kableExtra)
Warning: package 'kableExtra' was built under R version 4.3.2
Warning: package 'knitr' was built under R version 4.3.2
# Empty placeholder data frame; the code shown here does not fill it.
final_data <- data.frame()

# IMDb advanced-search result pages for Turkish feature films with at least
# 2500 votes. A page is requested with count=250, so the search is split into
# two release-date windows: 2010-2023 and everything up to the end of 2009.
# Only the release_date value differs between the two URLs.
urls <- paste0(
  "https://m.imdb.com/search/title/?title_type=feature",
  "&release_date=",
  c("2010-01-01,2023-12-31", ",2009-12-31"),
  "&num_votes=2500,&country_of_origin=TR&count=250"
)
# Scrape every search page in `urls` and build one vector per column:
# `titles`, `years`, `durations` (raw strings such as "3h 17m"), `ratings`
# and `votes`. Each page's values are stored in a list slot and concatenated
# after the loop, instead of growing the result vectors inside the loop.
page_titles <- vector("list", length(urls))
page_years <- vector("list", length(urls))
page_durations <- vector("list", length(urls))
page_ratings <- vector("list", length(urls))
page_votes <- vector("list", length(urls))

for (i in seq_along(urls)) {
  url <- urls[[i]]
  data_html <- read_html(url)

  # Titles: the first and last matches of this selector are dropped, then
  # everything up to the first space (the "1." style rank prefix) is removed.
  title_names <- data_html |>
    html_elements(".ipc-title__text") |>
    html_text()
  title_names <- tail(head(title_names, -1), -1)
  title_names <- vapply(
    str_split(title_names, " ", n = 2),
    function(parts) parts[2],
    character(1)
  )

  # Year and duration come from the same metadata items, so query them once.
  metadata <- data_html |>
    html_elements(".sc-43986a27-8.jHYIIK.dli-title-metadata-item") |>
    html_text()

  # A year is exactly four digits; this keeps other four-character items
  # from being coerced to NA and shifting the column.
  year_values <- as.numeric(metadata[grepl("^[0-9]{4}$", metadata)])

  # A duration looks like "3h 17m", "2h" or "45m"; anchoring the pattern
  # avoids matching unrelated items that merely contain an "h" or "m".
  duration_values <- metadata[
    grepl("^[0-9]+h( [0-9]+m)?$|^[0-9]+m$", metadata)
  ]

  # Ratings: the numeric rating is the first three characters of the text.
  rating_values <- data_html |>
    html_elements(".ipc-rating-star.ipc-rating-star--base.ipc-rating-star--imdb.sc-9ab53865-1.iXEijC.ratingGroup--imdb-rating") |>
    html_text()
  rating_values <- as.numeric(substr(rating_values, 1, 3))

  # Votes: skip the first five characters of the text, then drop the
  # thousands separators before converting to a number.
  vote_values <- data_html |>
    html_elements(".sc-53c98e73-0.kRnqtn") |>
    html_text()
  vote_values <- substr(vote_values, start = 6, stop = nchar(vote_values))
  vote_values <- as.numeric(gsub(",", "", vote_values))

  # The columns are matched purely by position, so a page where one field is
  # missing for some movie would silently misalign every following row.
  field_counts <- c(
    titles = length(title_names),
    years = length(year_values),
    durations = length(duration_values),
    ratings = length(rating_values),
    votes = length(vote_values)
  )
  if (length(unique(field_counts)) != 1L) {
    stop(
      "Scraped fields have different lengths for ", url, ": ",
      paste(names(field_counts), field_counts, sep = "=", collapse = ", "),
      call. = FALSE
    )
  }

  page_titles[[i]] <- title_names
  page_years[[i]] <- year_values
  page_durations[[i]] <- duration_values
  page_ratings[[i]] <- rating_values
  page_votes[[i]] <- vote_values
}

titles <- unlist(page_titles)
years <- unlist(page_years)
durations <- unlist(page_durations)
ratings <- unlist(page_ratings)
votes <- unlist(page_votes)
# Convert the scraped duration strings ("3h 17m", "2h", "45m") to minutes.
#   saat   - hour part, expressed in minutes (0 when there is no "h")
#   dakika - minute part (0 when there is no "m")
#   sure   - total running time in minutes
# This is a single vectorised pass; an earlier first attempt that was
# immediately overwritten (and produced a coercion warning) is removed.
has_hours <- grepl("h", durations, fixed = TRUE)
has_minutes <- grepl("m", durations, fixed = TRUE)

# Capture the digits in front of "h"; strings without hours count as 0.
saat <- ifelse(has_hours, sub("^ *([0-9]+) *h.*$", "\\1", durations), "0")
saat <- as.numeric(saat) * 60

# Capture the digits in front of "m", skipping an optional leading "<n>h".
dakika <- ifelse(
  has_minutes,
  sub("^(.*h)? *([0-9]+) *m.*$", "\\2", durations),
  "0"
)
dakika <- as.numeric(dakika)

sure <- saat + dakika
# Assemble the scraped columns into one data frame (one row per movie)
# and preview its first ten rows.
final <- data.frame(
  TITLE = titles,
  YEAR = years,
  DURATION = sure,
  RATING = ratings,
  VOTE = votes
)
final |>
  head(10) |>
  kable(caption = "IMDB Dataframe")
IMDB Dataframe
Kuru Otlar Üstüne |
2023 |
197 |
8.1 |
5059 |
Istanbul Için Son Çagri |
2023 |
91 |
5.3 |
7331 |
Yedinci Kogustaki Mucize |
2019 |
132 |
8.2 |
54151 |
Ölümlü Dünya 2 |
2023 |
117 |
7.5 |
3443 |
Bihter |
2023 |
113 |
3.6 |
3340 |
Ölümlü Dünya |
2018 |
107 |
7.6 |
30258 |
Kis Uykusu |
2014 |
196 |
8.0 |
54631 |
Dag II |
2016 |
135 |
8.2 |
109866 |
Do Not Disturb |
2023 |
114 |
6.3 |
8769 |
Ayla: The Daughter of War |
2017 |
125 |
8.3 |
42989 |
3(a)
Sort the movies by rating (highest first), add a standing column, and show the 5 best and the 5 worst movies.
# Rank the movies from the highest to the lowest rating and store each
# movie's position in STANDING (1 = best rated), then show the top five.
final2 <- final[order(final$RATING, decreasing = TRUE), ]
# seq_len() rather than 1:nrow(): it yields an empty sequence for an empty
# data frame, where 1:0 would produce c(1, 0) and break the assignment.
final2$STANDING <- seq_len(nrow(final2))
kable(head(final2, 5), caption = "1st 5 movies")
1st 5 movies
257 |
Hababam Sinifi |
1975 |
87 |
9.2 |
42512 |
1 |
39 |
CM101MMXI Fundamentals |
2013 |
139 |
9.1 |
46995 |
2 |
273 |
Tosun Pasa |
1976 |
90 |
8.9 |
24327 |
3 |
337 |
Hababam Sinifi Sinifta Kaldi |
1975 |
95 |
8.9 |
24370 |
4 |
321 |
Süt Kardesler |
1976 |
80 |
8.8 |
20885 |
5 |
# `final3` is the data sorted by rating in ascending order. The table below
# does not read it: the five worst movies are taken from the bottom of the
# descending ranking `final2`, so that STANDING is shown as well.
final3 <- final[order(final[["RATING"]], decreasing = FALSE), ]
final2 |>
  tail(5) |>
  kable(caption = "Last 5 movies")
Last 5 movies
189 |
Cumali Ceber 2 |
2018 |
100 |
1.2 |
10228 |
466 |
199 |
Müjde |
2022 |
48 |
1.2 |
9920 |
467 |
245 |
15/07 Safak Vakti |
2021 |
95 |
1.2 |
20606 |
468 |
101 |
Cumali Ceber: Allah Seni Alsin |
2017 |
100 |
1.0 |
39266 |
469 |
150 |
Reis |
2017 |
108 |
1.0 |
73972 |
470 |
3(b)
My top 3 movies are “Eksi Elmalar”, “Banker Bilo”, and “A.R.O.G”.
# Show the ranking rows (including STANDING) of the three favourite movies.
final2 |>
  subset(TITLE %in% c("A.R.O.G", "Banker Bilo", "Eksi Elmalar")) |>
  kable()
373 |
Banker Bilo |
1980 |
85 |
7.6 |
7351 |
90 |
255 |
A.R.O.G |
2008 |
127 |
7.3 |
44631 |
136 |
134 |
Eksi Elmalar |
2016 |
114 |
7.1 |
7828 |
173 |
Back to top