Assignment 2

Answers to Questions 1 and 2

I received help from AI for some parts, especially the conversion of durations to minutes.

Click to see the code

library(tidyverse)
library(rvest)
library(stringr)
library(knitr)


# IMDb search-result pages to scrape. Both queries carry the same filters in
# their query string (title_type=feature, num_votes=2500, country_of_origin=TR,
# count=250) and differ only in the release_date window:
#   link_1: release_date=2010-01-01,2023-12-31
#   link_2: release_date=,2009-12-31 (no lower bound given)
# NOTE(review): presumably count=250 is the per-page cap, which would be why
# the date range is split across two queries -- confirm against IMDb.
link_1 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&num_votes=2500,&country_of_origin=TR&count=250"
link_2 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&num_votes=2500,&country_of_origin=TR&count=250"

# Every URL in this vector is scraped in turn further below.
linked <- c(link_1, link_2)

# Convert duration strings such as "2h 12m", "2h" or "45m" to total minutes.
#
# Args:
#   duration_i: character vector (or scalar) of duration strings. Each element
#     may contain an hour part ("<digits>h"), a minute part ("<digits>m"),
#     both, or neither.
#
# Returns:
#   A character vector of the same length as `duration_i` holding the total
#   number of minutes as text, or "" for elements in which neither an hour nor
#   a minute part was found (this includes NA input).
convert_to_minutes <- function(duration_i) {
  duration_i <- as.character(duration_i)

  # Extract the digits that directly precede `unit` from every element;
  # elements without that unit yield NA.
  extract_part <- function(unit) {
    hit <- regexpr(paste0("\\d+(?=", unit, ")"), duration_i, perl = TRUE)
    found <- !is.na(hit) & hit > 0L
    part <- rep(NA_integer_, length(duration_i))
    # regmatches() returns only the matched elements, in order, so it lines up
    # with the positions flagged in `found`.
    part[found] <- as.integer(regmatches(duration_i, hit))
    part
  }

  hours <- extract_part("h")
  minutes <- extract_part("m")

  # An element is convertible when at least one of the two parts is present;
  # a missing part then simply contributes zero minutes.
  convertible <- !is.na(hours) | !is.na(minutes)
  hours[is.na(hours)] <- 0L
  minutes[is.na(minutes)] <- 0L
  total_minutes <- hours * 60L + minutes

  # Elementwise logic throughout (no scalar `&&`), so vector input is safe.
  result <- rep("", length(duration_i))
  result[convertible] <- as.character(total_minutes[convertible])
  result
}

# Scrape one IMDb search-result page into a data frame of movies.
#
# Args:
#   url: address of the search-result page to download.
#
# Returns:
#   A data frame with columns Title, Year, Duration (minutes), Rating and
#   Votes containing only rows without missing values, or NULL (with a
#   warning) when the scraped columns do not all have the same length.
scrape_movie_page <- function(url) {
  page <- read_html(url)

  # The first and last ".ipc-title__text" matches are discarded (presumably
  # page headings rather than movies -- confirm). The remaining entries are
  # split at their first space and only the part after it is kept, which
  # removes a leading token such as a rank number.
  titles <- page |>
    html_nodes(".ipc-title__text") |>
    html_text()
  titles <- tail(head(titles, -1), -1)
  titles <- vapply(
    str_split(titles, " ", n = 2),
    function(parts) parts[2],
    character(1)
  )

  # The release year is read from the first four characters of the first
  # metadata item.
  year_text <- page |>
    html_nodes(".dli-title-metadata-item:nth-child(1)") |>
    html_text()
  years <- as.numeric(substring(year_text, 1, 4))

  rating_text <- page |>
    html_nodes(".ratingGroup--imdb-rating") |>
    html_text()
  ratings <- as.numeric(str_extract(rating_text, "\\d+\\.\\d+"))

  # Unparseable durations come back as "" and become NA here. They are kept
  # in place (not removed) so that durations stay aligned with the titles.
  duration_text <- page |>
    html_nodes(".dli-title-metadata-item:nth-child(2)") |>
    html_text()
  durations <- as.numeric(
    vapply(duration_text, convert_to_minutes, character(1), USE.NAMES = FALSE)
  )

  # NOTE(review): ".cyGaqI" looks like a generated class name and may change
  # whenever the site is rebuilt -- confirm that it still selects vote counts.
  vote_text <- page |>
    html_nodes(".cyGaqI") |>
    html_text()
  vote_text <- gsub("Votes", "", vote_text)
  vote_text <- gsub(",", "", vote_text)
  votes <- as.numeric(vote_text)

  columns <- list(
    Title = titles,
    Year = years,
    Duration = durations,
    Rating = ratings,
    Votes = votes
  )

  # data.frame() silently recycles shorter columns when lengths divide evenly,
  # which would pair values with the wrong movie; refuse such pages instead.
  if (length(unique(lengths(columns))) != 1L) {
    warning(
      "Skipping page with misaligned columns (",
      paste(names(columns), lengths(columns), sep = "=", collapse = ", "),
      "): ", url,
      call. = FALSE
    )
    return(NULL)
  }

  page_data <- as.data.frame(columns, stringsAsFactors = FALSE)

  # Drop only the incomplete rows rather than the whole page.
  incomplete <- !complete.cases(page_data)
  if (any(incomplete)) {
    warning(
      "Skipping ", sum(incomplete), " row(s) with missing values from ", url,
      call. = FALSE
    )
    page_data <- page_data[!incomplete, , drop = FALSE]
  }

  page_data
}

# Empty template that fixes the column names and order of the result, even
# when no page yields any rows.
movie_data <- data.frame(
  Title = character(),
  Year = integer(),
  Duration = numeric(),
  Rating = numeric(),
  Votes = numeric(),
  stringsAsFactors = FALSE
)

# Scrape every page first, then bind once instead of growing the data frame
# inside a loop; pages that were skipped (NULL) are filtered out.
page_tables <- lapply(linked, scrape_movie_page)
page_tables <- Filter(Negate(is.null), page_tables)
movie_data <- bind_rows(c(list(movie_data), page_tables))

head(movie_data)

                     Title Year Duration Rating Votes
1        Kuru Otlar Üstüne 2023      197    8.1  5698
2                   Siccîn 2014       96    6.0  5128
3  Istanbul Için Son Çagri 2023       91    5.3  7873
4 Yedinci Kogustaki Mucize 2019      132    8.2 54445
5                   Bihter 2023      113    3.6  3490
6                 Siccin 2 2015       93    6.1  3754

Question 3

(a)

Click to see the code

# Order all movies from the highest to the lowest rating, then keep the first
# five rows (best rated) and the last five rows (worst rated) of that table.
# Note that the variable `sort` shares its name with base R's sort() function.
sort <- arrange(movie_data, desc(Rating))
top_5 <- head(sort, n = 5)
bottom_5 <- tail(sort, n = 5)
print(top_5)

                         Title Year Duration Rating Votes
1               Hababam Sinifi 1975       87    9.2 42594
2       CM101MMXI Fundamentals 2013      139    9.1 47043
3                   Tosun Pasa 1976       90    8.9 24371
4 Hababam Sinifi Sinifta Kaldi 1975       95    8.9 24412
5                Süt Kardesler 1976       80    8.8 20917

Click to see the code

# Show the five lowest-rated movies (the last rows of the rating-sorted table).
print(bottom_5)

                             Title Year Duration Rating Votes
467                 Cumali Ceber 2 2018      100    1.2 10253
468                          Müjde 2022       48    1.2  9940
469              15/07 Safak Vakti 2021       95    1.2 20612
470                           Reis 2017      108    1.0 73988
471 Cumali Ceber: Allah Seni Alsin 2017      100    1.0 39272

Of the movies in the top 5, I have watched all except Cem Yılmaz’s film. I was surprised that CM101MMXI Fundamentals made it into the top 5, and in second place at that, but I think the ranking of the other four movies is correct.

(b)

Click to see the code

# Keep the rows for two specific films. Titles are lower-cased before the
# comparison, so the match does not depend on how the title is capitalised.
selected <- filter(
  movie_data,
  str_to_lower(Title) %in% c("hababam sinifi", "g.o.r.a.")
)

print(selected)

           Title Year Duration Rating Votes
1 Hababam Sinifi 1975       87    9.2 42594
2       G.O.R.A. 2004      127    8.0 66197

Both films were loved by a large audience in their time and are cult films.

(c)

I didn’t have a chance to complete this part.

(d)

Click to see the code

# Scatter plot of rating against number of votes, overlaid with a red
# least-squares line (no confidence band).
ggplot(data = movie_data, mapping = aes(x = Votes, y = Rating)) +
  geom_point() +
  geom_smooth(color = "red", se = FALSE, method = "lm") +
  labs(
    x = "Votes",
    y = "Ratings",
    title = "Correlation between Votes and Ratings"
  )

Click to see the code

# Correlation coefficient (cor() default method) between votes and rating.
with(movie_data, cor(Votes, Rating))

[1] 0.1318886

There is a positive correlation between ratings and votes, but it is not strong; it is relatively weak, since the correlation coefficient is 0.1318886.

(e)

Click to see the code

# Scatter plot of rating against duration, overlaid with an orange
# least-squares line (no confidence band). The title and x-axis label name
# Duration, the variable actually mapped to x.
ggplot(movie_data, aes(x = Duration, y = Rating)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(title = "Correlation between Duration and Ratings",
       x = "Duration (minutes)",
       y = "Ratings")

Click to see the code

# Correlation coefficient (cor() default method) between duration and rating.
with(movie_data, cor(Duration, Rating))

[1] 0.03303303

There is a weak positive correlation between ratings and durations, since the correlation coefficient is 0.03303303.