Assignment 2

Step 2

Web scraping to create a data frame with the columns we need: Title, Year, Duration, Rating, Votes

knitr::opts_chunk$set(warning = FALSE)
library(rvest)
Warning: package 'rvest' was built under R version 4.3.2
library(dplyr)
Warning: package 'dplyr' was built under R version 4.3.2
library(stringr)
library(kableExtra)
Warning: package 'kableExtra' was built under R version 4.3.2
library(knitr)
Warning: package 'knitr' was built under R version 4.3.2
# Empty data frame created up front.
# NOTE(review): final_data is not referenced again in the visible code —
# confirm nothing else relies on it before removing.
final_data <- data.frame()

# IMDb search-result pages to scrape. Both query strings request
# title_type=feature, num_votes=2500, country_of_origin=TR and count=250;
# they differ only in release_date (2010-01-01..2023-12-31 vs. ..2009-12-31).
urls <- c(
  "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&num_votes=2500,&country_of_origin=TR&count=250",
  "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&num_votes=2500,&country_of_origin=TR&count=250"
)

# Typed, empty accumulators: every scraped page appends to these vectors.
# Starting from typed vectors (rather than NULL) keeps the column types
# well-defined even if no page yields any rows.
titles <- character()
years <- numeric()
durations <- character()
ratings <- numeric()
votes <- numeric()

# Scrape every search-result page and append its titles, years, durations,
# ratings and vote counts to the accumulator vectors.
for (url in urls) {
  data_html <- read_html(url)

  # Titles: the first and the last ".ipc-title__text" node are discarded
  # (presumably page-level headings rather than movies — confirm against the
  # page), then everything up to the first space is stripped from each entry
  # (presumably the list position such as "1." — confirm).
  title_names <- data_html |>
    html_nodes(".ipc-title__text") |>
    html_text()
  title_names <- tail(head(title_names, -1), -1)
  title_names <- str_split(title_names, " ", n = 2)
  title_names <- vapply(title_names, function(x) x[2], character(1))

  # Year and duration share the same metadata node class, so the nodes are
  # queried once and the two fields are told apart by their text pattern.
  metadata_items <- data_html |>
    html_nodes(".sc-43986a27-8.jHYIIK.dli-title-metadata-item") |>
    html_text()
  metadata_items <- trimws(metadata_items)

  # Years: exactly four digits. A pure length test would also accept other
  # four-character tokens, which would turn into NA and shift the column.
  year_names <- grep("^[0-9]{4}$", metadata_items, value = TRUE)
  year_names <- as.numeric(year_names)

  # Durations: "<n>h <n>m", "<n>h" or "<n>m" only, instead of any token that
  # merely contains the letter "h" or "m".
  duration_names <- grep(
    "^([0-9]+h([[:space:]]*[0-9]+m)?|[0-9]+m)$",
    metadata_items,
    value = TRUE
  )

  # Ratings: the first three characters of the node text are parsed as the
  # numeric rating.
  rating_names <- data_html |>
    html_nodes(".ipc-rating-star.ipc-rating-star--base.ipc-rating-star--imdb.sc-9ab53865-1.iXEijC.ratingGroup--imdb-rating") |>
    html_text()
  rating_names <- as.numeric(substr(rating_names, 1, 3))

  # Votes: the first five characters are dropped (presumably a "Votes" label
  # — confirm), then thousands separators are removed before conversion.
  vote_names <- data_html |>
    html_nodes(".sc-53c98e73-0.kRnqtn") |>
    html_text()
  vote_names <- substr(vote_names, start = 6, stop = nchar(vote_names))
  vote_names <- as.numeric(gsub(",", "", vote_names))

  # All five fields must describe the same movies; fail here, naming the
  # page, rather than later with misaligned or unequal-length columns.
  field_lengths <- c(
    title = length(title_names),
    year = length(year_names),
    duration = length(duration_names),
    rating = length(rating_names),
    votes = length(vote_names)
  )
  if (length(unique(field_lengths)) > 1L) {
    stop(
      "Scraped fields have different lengths for ", url, ": ",
      paste(names(field_lengths), field_lengths, sep = "=", collapse = ", "),
      call. = FALSE
    )
  }

  titles <- c(titles, title_names)
  years <- c(years, year_names)
  durations <- c(durations, duration_names)
  ratings <- c(ratings, rating_names)
  votes <- c(votes, vote_names)
}
# Convert runtime strings such as "2h 5m", "2h" or "45m" into total minutes.
# The result names are kept: `saat` holds the hour part expressed in minutes,
# `dakika` the minute part, and `sure` their sum, which becomes the DURATION
# column when the data frame is built.

# Return the number that directly precedes `unit` in each element of `x`:
# 0 when the unit does not occur, NA when the element itself is NA.
extract_duration_part <- function(x, unit) {
  x <- as.character(x)
  # Look-ahead keeps only the digits, not the unit letter, in the match.
  hit <- regexpr(paste0("[0-9]+(?=", unit, ")"), x, perl = TRUE)
  found <- !is.na(hit) & hit > 0
  value <- numeric(length(x))
  # regmatches() returns one string per successful match, in order.
  value[found] <- as.numeric(regmatches(x, hit))
  value[is.na(x)] <- NA_real_
  value
}

saat <- extract_duration_part(durations, "h") * 60
dakika <- extract_duration_part(durations, "m")

sure <- saat + dakika
# Assemble the scraped vectors into one data frame, one row per movie.
final <- data.frame(
  TITLE = titles,
  YEAR = years,
  DURATION = sure,
  RATING = ratings,
  VOTE = votes
)

# Preview the first ten rows as a captioned table.
final |>
  head(10) |>
  kable(caption = "IMDB Dataframe")
IMDB Dataframe
TITLE YEAR DURATION RATING VOTE
Kuru Otlar Üstüne 2023 197 8.1 5059
Istanbul Için Son Çagri 2023 91 5.3 7331
Yedinci Kogustaki Mucize 2019 132 8.2 54151
Ölümlü Dünya 2 2023 117 7.5 3443
Bihter 2023 113 3.6 3340
Ölümlü Dünya 2018 107 7.6 30258
Kis Uykusu 2014 196 8.0 54631
Dag II 2016 135 8.2 109866
Do Not Disturb 2023 114 6.3 8769
Ayla: The Daughter of War 2017 125 8.3 42989

3(a)

Sort the data frame by rating in descending order, add each movie's standing, and show the five highest-rated and the five lowest-rated movies.

# Rank the movies from highest to lowest rating.
final2 <- final[order(final$RATING, decreasing = TRUE), ]
# STANDING is the 1-based position in that ranking. seq_len() is used instead
# of 1:nrow() so a zero-row data frame gets an empty column rather than an
# error from the sequence c(1, 0).
final2$STANDING <- seq_len(nrow(final2))

# The five highest-rated movies.
kable(head(final2, 5), caption = "1st 5 movies")
1st 5 movies
TITLE YEAR DURATION RATING VOTE STANDING
257 Hababam Sinifi 1975 87 9.2 42512 1
39 CM101MMXI Fundamentals 2013 139 9.1 46995 2
273 Tosun Pasa 1976 90 8.9 24327 3
337 Hababam Sinifi Sinifta Kaldi 1975 95 8.9 24370 4
321 Süt Kardesler 1976 80 8.8 20885 5
# Copy of the data ordered by ascending rating.
# NOTE(review): final3 is not referenced again in the visible code.
final3 <- final[order(final$RATING), ]

# The five lowest-rated movies are the tail of the descending ranking.
final2 |>
  tail(5) |>
  kable(caption = "Last 5 movies")
Last 5 movies
TITLE YEAR DURATION RATING VOTE STANDING
189 Cumali Ceber 2 2018 100 1.2 10228 466
199 Müjde 2022 48 1.2 9920 467
245 15/07 Safak Vakti 2021 95 1.2 20606 468
101 Cumali Ceber: Allah Seni Alsin 2017 100 1.0 39266 469
150 Reis 2017 108 1.0 73972 470

3(b)

My top 3 movies are “Eksi Elmalar”, “Banker Bilo”, and “A.R.O.G”.

# Show the ranking rows (including STANDING) of the three favourite titles.
final2[which(final2$TITLE %in% c("A.R.O.G", "Banker Bilo","Eksi Elmalar")), ] |>
  kable()
TITLE YEAR DURATION RATING VOTE STANDING
373 Banker Bilo 1980 85 7.6 7351 90
255 A.R.O.G 2008 127 7.3 44631 136
134 Eksi Elmalar 2016 114 7.1 7828 173
Back to top