Assignment 2

Assignment 2

(1)

# IMDb search pages for feature films with country_of_origin=TR, sorted by
# user rating (descending), num_votes=2500, and count=250 per page.
# The query is split by release date into 2010-2023 and up to 2009
# (presumably because one results page holds at most 250 titles -- confirm).
imdb_search_base <- "https://m.imdb.com/search/title/?title_type=feature"
imdb_search_tail <- paste0(
  "&sort=user_rating,desc&num_votes=2500,",
  "&country_of_origin=TR&count=250"
)
url <- paste0(
  imdb_search_base,
  c("&release_date=2010-01-01,2023-12-31", "&release_date=,2009-12-31"),
  imdb_search_tail
)

(2)

library(tidyverse)
library(rvest)
library(stringr)
library(knitr)
library(ggplot2)


# Convert runtime strings such as "1h 30m", "2h" or "45m" to total minutes.
#
# Args:
#   time_str: Character vector of runtime strings (a single string also works).
#
# Returns:
#   Numeric vector with one total-minutes value per element of `time_str`
#   (numeric(0) for empty input). Elements whose parts cannot be parsed as
#   numbers yield NA, with the usual as.numeric() coercion warning.
convert_time <- function(time_str) {
  # Scalar parser: if () needs a length-one condition, so each element is
  # handled on its own.
  to_minutes <- function(one_str) {
    hours <- 0
    minutes <- 0

    if (grepl("h", one_str)) {
      # "1h 30m" splits into c("1", " 30"); "2h" splits into just "2".
      time_components <- strsplit(one_str, "h|m")[[1]]
      if (length(time_components) >= 1) {
        hours <- as.numeric(time_components[1])
      }
      if (length(time_components) >= 2) {
        minutes <- as.numeric(time_components[2])
      }
    } else {
      # No hour part: the whole string is a minute count such as "45m".
      minutes <- as.numeric(gsub("m", "", one_str))
    }

    hours * 60 + minutes
  }

  # vapply() pins the result to a plain numeric vector, even for empty input.
  vapply(as.character(time_str), to_minutes, numeric(1), USE.NAMES = FALSE)
}


# Scrape one IMDb search-results page into a data frame of movies.
#
# Args:
#   page_url: URL of an IMDb search-results page.
#
# Returns:
#   A data frame with one row per listed title and the columns Title, Year,
#   Duration (total minutes, via convert_time()), Rating and Votes.
#   Stops with an error when the scraped fields differ in length, because
#   they could then no longer be matched up row by row.
scrape_movie_page <- function(page_url) {
  page <- read_html(page_url)

  # The first and last '.ipc-title__text' nodes are dropped: they are not
  # movie titles (presumably page headings -- re-check if the layout changes).
  titles <- page |>
    html_nodes(".ipc-title__text") |>
    html_text()
  titles <- tail(head(titles, -1), -1)
  # Keep only the text after the first space (presumably this removes a
  # rank prefix such as "1. " -- confirm against the live page).
  titles <- vapply(
    str_split(titles, " ", n = 2),
    function(parts) parts[2],
    character(1)
  )

  years <- page |>
    html_nodes(".dli-title-metadata-item:nth-child(1)") |>
    html_text() |>
    as.numeric()

  durations <- page |>
    html_nodes(".dli-title-metadata-item:nth-child(2)") |>
    html_text()
  durations <- vapply(durations, convert_time, numeric(1), USE.NAMES = FALSE)

  # The rating node text holds more than the score; keep the first decimal.
  ratings <- page |>
    html_nodes(".ratingGroup--imdb-rating") |>
    html_text() |>
    str_extract("\\d+\\.\\d+") |>
    as.numeric()

  # NOTE(review): ".kRnqtn" looks like a build-generated class name and may
  # change without notice -- confirm it still selects the vote counts.
  votes <- page |>
    html_nodes(".kRnqtn") |>
    html_text()
  # Strip every non-digit (separators, brackets, ...) before converting.
  votes <- as.numeric(gsub("\\D", "", votes))

  field_lengths <- lengths(list(titles, years, durations, ratings, votes))
  if (length(unique(field_lengths)) != 1) {
    stop(
      "Scraped fields have different lengths (",
      paste(field_lengths, collapse = ", "),
      ") for ", page_url, "; the page layout may have changed.",
      call. = FALSE
    )
  }

  data.frame(
    Title = titles,
    Year = years,
    Duration = durations,
    Rating = ratings,
    Votes = votes
  )
}

# Scrape every search page in `url` and stack the results into one table.
movie_data <- url |>
  lapply(scrape_movie_page) |>
  bind_rows()

(3)

(a)

# Rank all movies from the highest to the lowest rating.
movies <- arrange(movie_data, desc(Rating))

# The first five rows of the ranked table are the best-rated movies.
kable(
  head(movies, 5),
  caption = "Top 5 Movies",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Top 5 Movies
Title Year Duration Rating Votes
Hababam Sinifi 1975 87 9.2 42512
CM101MMXI Fundamentals 2013 139 9.1 46996
Tosun Pasa 1976 90 8.9 24329
Hababam Sinifi Sinifta Kaldi 1975 95 8.9 24369
Süt Kardesler 1976 80 8.8 20889

I have watched all 5 movies on this list. Of course, I would change the order of some of them if I were making the ranking myself, but overall I am happy with this list (especially the first 2 rows).

# `movies` is sorted by descending rating, so its last five rows are the
# lowest-rated movies.
kable(
  tail(movies, 5),
  caption = "Bottom 5 Movies",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Bottom 5 Movies
Title Year Duration Rating Votes
466 Cumali Ceber 2 2018 100 1.2 10230
467 Müjde 2022 48 1.2 9919
468 15/07 Safak Vakti 2021 95 1.2 20608
469 Cumali Ceber: Allah Seni Alsin 2017 100 1.0 39269
470 Reis 2017 108 1.0 73974

I respect the opinions of the voters and have not watched any of the movies on this list. :)

(b)

# Show three hand-picked favourites, selected by exact title match.
kable(
  filter(
    movie_data,
    Title %in% c("CM101MMXI Fundamentals", "Hababam Sinifi", "Masumiyet")
  ),
  caption = "My Best Three",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
My Best Three
Title Year Duration Rating Votes
CM101MMXI Fundamentals 2013 139 9.1 46996
Hababam Sinifi 1975 87 9.2 42512
Masumiyet 1997 110 8.1 19295

(c)

# Scatter plot of the mean rating of the movies released in each year.
movie_data |>
  group_by(Year) |>
  summarise(yearly_average = mean(Rating)) |>
  ggplot(aes(x = Year, y = yearly_average)) +
  geom_point() +
  labs(title = "Yearly Rating Averages")

As you can see, average movie ratings have been decreasing over the years. The causes are complex, but in short, my opinion is that revenue has increasingly been prioritized over quality as the years have gone by.

# Bar chart of the number of movies per release year. Year is treated as a
# factor so every year gets its own bar; the axis labels are rotated by 90
# degrees.
movie_data |>
  ggplot(aes(x = factor(Year))) +
  geom_bar(fill = "blue", color = "black") +
  labs(
    title = "Number of Movies Over the Years",
    x = "Year",
    y = "Number of Movies"
  ) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  )

In line with my previous comment, the number of films may have increased over the years because many short-lived films are now made mainly for the sake of revenue.

# Box plots of the rating distribution for each release year.
# Year is turned into a factor inside aes() only. Overwriting movie_data$Year
# with a factor would make a later as.numeric(Year) return the factor's level
# codes (1, 2, ...) instead of the actual years.
ggplot(movie_data, aes(x = factor(Year), y = Rating, fill = factor(Year))) +
  geom_boxplot(color = "black", width = 0.5) +
  labs(x = "Year", y = "Rating") +
  ggtitle("Box Plots of Movie Ratings Over the Years") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  )

(d)

# Correlation (cor() default method, i.e. Pearson) between the number of
# votes and the rating.
corr <- with(movie_data, cor(Votes, Rating))
cat("Correlation between Votes and Ratings:", corr)
Correlation between Votes and Ratings: 0.130875

(e)

# Correlation (cor() default method, i.e. Pearson) between the duration in
# minutes and the rating. The printed label names Duration, matching the
# columns actually passed to cor().
corr <- cor(movie_data$Duration, movie_data$Rating)
cat("Correlation between Duration and Ratings:", corr)
Correlation between Duration and Ratings: 0.03356006

(4)

# Keep a copy of the full movie table before the Top 1000 comparison.
movie_data1 <- movie_data

# Search page restricted to groups=top_1000 and country_of_origin=TR,
# sorted by release date (ascending).
url <- "https://m.imdb.com/search/title/?title_type=feature&sort=release_date,asc&num_votes=2500,&groups=top_1000&country_of_origin=TR&count=250"
data_html <- read_html(url)

# Drop the first and last '.ipc-title__text' nodes, then keep only the text
# after the first space of each remaining entry (presumably this strips a
# rank prefix such as "1. " -- confirm against the live page).
title_names <- data_html |>
  html_nodes(".ipc-title__text") |>
  html_text()
title_names <- tail(head(title_names, -1), -1)
title_names <- str_split(title_names, " ", n = 2)
title_names <- vapply(title_names, function(x) x[2], character(1))

year <- data_html |>
  html_nodes(".dli-title-metadata-item:nth-child(1)") |>
  html_text() |>
  as.numeric()

top_movies <- data.frame(
  Title = title_names,
  Year = year
)
# Render the scraped Top 1000 titles and their release years as a table.
kable(
  top_movies,
  caption = "Turkish Movies in the Top 1000"
)
Turkish Movies in the Top 1000
Title Year
Eskiya 1996
Her Sey Çok Güzel Olacak 1998
Vizontele 2001
G.O.R.A. 2004
Babam ve Oglum 2005
Nefes: Vatan Sagolsun 2009
Bir Zamanlar Anadolu’da 2011
Kis Uykusu 2014
Ayla: The Daughter of War 2017
Ahlat Agaci 2018
Yedinci Kogustaki Mucize 2019
# Make sure Year is numeric in both tables. Going through as.character()
# matters when Year is a factor: as.numeric() applied directly to a factor
# returns its internal level codes (1, 2, ...) rather than the years.
movie_data <- movie_data %>%
  mutate(Year = as.numeric(as.character(Year)))
top_movies <- top_movies %>%
  mutate(Year = as.numeric(as.character(Year)))

# Add duration, rating and votes to the Top 1000 titles by matching on both
# title and year, then order the result by rating (best first).
movies_top2 <- movies %>%
  inner_join(top_movies, by = c("Title", "Year")) %>%
  arrange(desc(Rating))
kable(
  movies_top2,
  caption = "Turkish Movies in the Top 1000 Full Table",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Turkish Movies in the Top 1000 Full Table
Title Year Duration Rating Votes
Ayla: The Daughter of War 2017 125 8.3 42992
Yedinci Kogustaki Mucize 2019 132 8.2 54171
Babam ve Oglum 2005 108 8.2 91035
Eskiya 1996 128 8.1 71704
Her Sey Çok Güzel Olacak 1998 107 8.1 27122
Kis Uykusu 2014 196 8.0 54646
Ahlat Agaci 2018 188 8.0 27015
Nefes: Vatan Sagolsun 2009 128 8.0 35022
G.O.R.A. 2004 127 8.0 66033
Vizontele 2001 110 8.0 38403
Bir Zamanlar Anadolu’da 2011 157 7.8 49365
# Order the saved copy of the full table by rating (best first) and show its
# first 11 rows.
movie_data1 <- movie_data1 %>%
  arrange(desc(Rating))

kable(
  movie_data1 %>% head(11),
  caption = "Turkish Movies in the Top 11",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Turkish Movies in the Top 11
Title Year Duration Rating Votes
Hababam Sinifi 1975 87 9.2 42512
CM101MMXI Fundamentals 2013 139 9.1 46996
Tosun Pasa 1976 90 8.9 24329
Hababam Sinifi Sinifta Kaldi 1975 95 8.9 24369
Süt Kardesler 1976 80 8.8 20889
Saban Oglu Saban 1977 90 8.7 18534
Zügürt Aga 1985 101 8.7 16135
Neseli Günler 1978 95 8.7 11807
Kibar Feyzo 1978 83 8.7 17126
Hababam Sinifi Uyaniyor 1976 94 8.7 20640
Canim Kardesim 1973 85 8.6 10097
Back to top