# IMDb search-result pages to scrape. Both queries request feature films
# (title_type=feature) with country_of_origin=TR and num_votes=2500, sorted by
# user rating in descending order, 250 results per page (count=250). They
# differ only in the release_date range:
#   url[1]: 2010-01-01 through 2023-12-31
#   url[2]: up to 2009-12-31
url <- c(
  "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&sort=user_rating,desc&num_votes=2500,&country_of_origin=TR&count=250",
  "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&sort=user_rating,desc&num_votes=2500,&country_of_origin=TR&count=250"
)
Assignment 2
Assignment 2
(1)
(2)
library(tidyverse)
library(rvest)
library(stringr)
library(knitr)
library(ggplot2)
#' Convert a runtime string to a total number of minutes
#'
#' Understands strings such as "1h 30m", "2h" and "45m". The function is
#' vectorised: it accepts a single string or a whole character vector, so it
#' works both when called directly on a vector and when applied element by
#' element (for example through `lapply()`).
#'
#' @param time_str Character vector of runtimes.
#' @return Numeric vector with one element per element of `time_str`, holding
#'   the runtime in minutes. `NA` input, or text that cannot be read as a
#'   number, yields `NA`.
convert_time <- function(time_str) {
  time_str <- as.character(time_str)
  has_hours <- grepl("h", time_str, fixed = TRUE)

  # Hours are whatever precedes the first "h"; strings without an hour marker
  # contribute 0 hours.
  hours <- numeric(length(time_str))
  hours[has_hours] <- as.numeric(sub("h.*$", "", time_str[has_hours]))

  # Minutes are what remains once the hour part and the "m" marker are removed.
  minute_text <- gsub("m", "", sub("^[^h]*h", "", time_str), fixed = TRUE)
  # An hour-only value such as "2h" has no minute part: count it as 0 minutes.
  minute_text[has_hours & !nzchar(trimws(minute_text))] <- "0"
  minutes <- as.numeric(minute_text)

  hours * 60 + minutes
}
#' Scrape one IMDb search-result page into a data frame
#'
#' Downloads the page once and pulls each field out with its own CSS selector.
#' The fields are matched up purely by position, so `data.frame()` stops with
#' an error if the selectors return different numbers of entries.
#'
#' @param page_url URL of the search-result page to download.
#' @return A data frame with the columns `Title`, `Year`, `Duration` (minutes,
#'   via `convert_time()`), `Rating` and `Votes`.
scrape_movie_page <- function(page_url) {
  page <- read_html(page_url)
  node_text <- function(css) {
    html_text(html_nodes(page, css))
  }

  # The first and the last ".ipc-title__text" match are discarded; every
  # remaining entry is split at its first space and only the part after it is
  # kept (presumably this strips a leading rank such as "1." - verify).
  titles <- node_text(".ipc-title__text")
  titles <- tail(head(titles, -1), -1)
  titles <- vapply(
    str_split(titles, " ", n = 2),
    function(parts) parts[2],
    character(1)
  )

  years <- as.numeric(node_text(".dli-title-metadata-item:nth-child(1)"))

  durations <- vapply(
    node_text(".dli-title-metadata-item:nth-child(2)"),
    convert_time,
    numeric(1),
    USE.NAMES = FALSE
  )

  # Keep the first decimal number found in the rating text.
  ratings <- as.numeric(
    str_extract(node_text(".ratingGroup--imdb-rating"), "\\d+\\.\\d+")
  )

  # NOTE(review): ".kRnqtn" looks like an auto-generated class name and may
  # stop matching when the site is rebuilt - confirm before relying on it.
  # Every non-digit character is removed before the conversion.
  votes <- as.numeric(gsub("\\D", "", node_text(".kRnqtn")))

  data.frame(
    Title = titles,
    Year = years,
    Duration = durations,
    Rating = ratings,
    Votes = votes
  )
}

# Stack the results of both search pages into a single table.
movie_data <- rbind(
  scrape_movie_page(url[1]),
  scrape_movie_page(url[2])
)
(3)
(a)
# Rank every movie from the highest to the lowest rating.
movies <- movie_data %>%
  arrange(desc(Rating))

# Show the five best-rated movies.
kable(
  head(movies, 5),
  caption = "Top 5 Movies",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
| Title | Year | Duration | Rating | Votes |
|---|---|---|---|---|
| Hababam Sinifi | 1975 | 87 | 9.2 | 42512 |
| CM101MMXI Fundamentals | 2013 | 139 | 9.1 | 46996 |
| Tosun Pasa | 1976 | 90 | 8.9 | 24329 |
| Hababam Sinifi Sinifta Kaldi | 1975 | 95 | 8.9 | 24369 |
| Süt Kardesler | 1976 | 80 | 8.8 | 20889 |
I have watched all five movies on this list. Of course, there are movies whose order I would change if I were making the ranking myself, but I am happy with this list (especially the first two rows).
# Show the five worst-rated movies, i.e. the last rows of the rating-sorted
# table. The rows keep their positions in `movies` as row names, which kable
# prints as an extra, unnamed first column.
kable(
  tail(movies, 5),
  caption = "Bottom 5 Movies",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
| | Title | Year | Duration | Rating | Votes |
|---|---|---|---|---|---|
| 466 | Cumali Ceber 2 | 2018 | 100 | 1.2 | 10230 |
| 467 | Müjde | 2022 | 48 | 1.2 | 9919 |
| 468 | 15/07 Safak Vakti | 2021 | 95 | 1.2 | 20608 |
| 469 | Cumali Ceber: Allah Seni Alsin | 2017 | 100 | 1.0 | 39269 |
| 470 | Reis | 2017 | 108 | 1.0 | 73974 |
I respect the opinions of the voters and have not watched any of the movies on this list. :)
(b)
# Show the rows for three hand-picked titles.
kable(
  movie_data %>%
    filter(
      Title %in% c("CM101MMXI Fundamentals", "Hababam Sinifi", "Masumiyet")
    ),
  caption = "My Best Three",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
| Title | Year | Duration | Rating | Votes |
|---|---|---|---|---|
| CM101MMXI Fundamentals | 2013 | 139 | 9.1 | 46996 |
| Hababam Sinifi | 1975 | 87 | 9.2 | 42512 |
| Masumiyet | 1997 | 110 | 8.1 | 19295 |
(c)
# Scatter plot of the mean rating per release year (one point per year).
movie_data %>%
  group_by(Year) %>%
  summarise(yearly_average = mean(Rating)) %>%
  ggplot(mapping = aes(x = Year, y = yearly_average)) +
  geom_point() +
  ggtitle("Yearly Rating Averages")
As you can see, average movie ratings have been decreasing over the years. This is a complex subject, but in short, my opinion is that revenue has increasingly been prioritized over quality as the years have gone by.
# Bar chart of the number of movies per release year. Year is wrapped in
# factor() so that it is treated as a discrete category on the x axis.
ggplot(data = movie_data, mapping = aes(x = factor(Year))) +
  geom_bar(fill = "blue", color = "black") +
  labs(x = "Year", y = "Number of Movies") +
  ggtitle("Number of Movies Over the Years") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  )
In line with my previous comment, the increase in the number of films over the years may be because many short-lived films are now made mainly for the sake of revenue.
# Box plot of the rating distribution for each release year.
# Year is converted to a factor in place, so movie_data$Year remains a factor
# for everything that runs after this point.
movie_data[["Year"]] <- as.factor(movie_data[["Year"]])
ggplot(
  data = movie_data,
  mapping = aes(x = Year, y = Rating, fill = factor(Year))
) +
  geom_boxplot(color = "black", width = 0.5) +
  labs(x = "Year", y = "Rating") +
  ggtitle("Box Plots of Movie Ratings Over the Years") +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  )
(d)
# Correlation between vote count and rating (cor() defaults to Pearson).
corr <- cor(movie_data$Votes, movie_data$Rating)
cat("Correlation between Votes and Ratings:", corr)
Correlation between Votes and Ratings: 0.130875
(e)
# Correlation between runtime and rating (cor() defaults to Pearson).
# The printed label names Duration, matching the variables actually compared.
corr <- cor(movie_data$Duration, movie_data$Rating)
cat("Correlation between Duration and Ratings:", corr)
Correlation between Duration and Ratings: 0.03356006
(4)
# Keep a copy of movie_data as it stands at this point.
movie_data1 <- movie_data

# Search page restricted to groups=top_1000, sorted by release date ascending.
url <- "https://m.imdb.com/search/title/?title_type=feature&sort=release_date,asc&num_votes=2500,&groups=top_1000&country_of_origin=TR&count=250"
data_html <- read_html(url)

# Titles: drop the first and the last match, then keep only the text that
# follows the first space of each remaining entry.
title_names <- data_html |>
  html_nodes(".ipc-title__text") |>
  html_text()
title_names <- title_names |>
  head(-1) |>
  tail(-1) |>
  str_split(" ", n = 2)
title_names <- unlist(lapply(title_names, function(parts) parts[2]))

# Release years, taken from the first metadata item of each entry.
year <- data_html |>
  html_nodes(".dli-title-metadata-item:nth-child(1)") |>
  html_text() |>
  as.numeric()

top_movies <- data.frame(Title = title_names, Year = year)
# Show the titles and years scraped from the top-1000 search page.
kable(
  top_movies,
  caption = "Turkish Movies in the Top 1000"
)
| Title | Year |
|---|---|
| Eskiya | 1996 |
| Her Sey Çok Güzel Olacak | 1998 |
| Vizontele | 2001 |
| G.O.R.A. | 2004 |
| Babam ve Oglum | 2005 |
| Nefes: Vatan Sagolsun | 2009 |
| Bir Zamanlar Anadolu’da | 2011 |
| Kis Uykusu | 2014 |
| Ayla: The Daughter of War | 2017 |
| Ahlat Agaci | 2018 |
| Yedinci Kogustaki Mucize | 2019 |
# Make sure Year is numeric in both tables. Going through as.character()
# matters when Year is a factor: as.numeric() applied directly to a factor
# returns the internal level codes (1, 2, 3, ...) instead of the years.
movie_data <- movie_data %>%
  mutate(Year = as.numeric(as.character(Year)))
top_movies <- top_movies %>%
  mutate(Year = as.numeric(Year))

# Attach duration, rating and votes to the top-1000 titles by matching on
# both Title and Year, then order by rating.
movies_top2 <- movies %>%
  inner_join(top_movies, by = c("Title", "Year")) %>%
  arrange(desc(Rating))

kable(
  movies_top2,
  caption = "Turkish Movies in the Top 1000 Full Table",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
| Title | Year | Duration | Rating | Votes |
|---|---|---|---|---|
| Ayla: The Daughter of War | 2017 | 125 | 8.3 | 42992 |
| Yedinci Kogustaki Mucize | 2019 | 132 | 8.2 | 54171 |
| Babam ve Oglum | 2005 | 108 | 8.2 | 91035 |
| Eskiya | 1996 | 128 | 8.1 | 71704 |
| Her Sey Çok Güzel Olacak | 1998 | 107 | 8.1 | 27122 |
| Kis Uykusu | 2014 | 196 | 8.0 | 54646 |
| Ahlat Agaci | 2018 | 188 | 8.0 | 27015 |
| Nefes: Vatan Sagolsun | 2009 | 128 | 8.0 | 35022 |
| G.O.R.A. | 2004 | 127 | 8.0 | 66033 |
| Vizontele | 2001 | 110 | 8.0 | 38403 |
| Bir Zamanlar Anadolu’da | 2011 | 157 | 7.8 | 49365 |
# Sort the saved copy by rating and show its first 11 rows, the same number
# of rows as the top-1000 table.
movie_data1 <- movie_data1 %>%
  arrange(desc(Rating))

kable(
  head(movie_data1, 11),
  caption = "Turkish Movies in the Top 11",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
| Title | Year | Duration | Rating | Votes |
|---|---|---|---|---|
| Hababam Sinifi | 1975 | 87 | 9.2 | 42512 |
| CM101MMXI Fundamentals | 2013 | 139 | 9.1 | 46996 |
| Tosun Pasa | 1976 | 90 | 8.9 | 24329 |
| Hababam Sinifi Sinifta Kaldi | 1975 | 95 | 8.9 | 24369 |
| Süt Kardesler | 1976 | 80 | 8.8 | 20889 |
| Saban Oglu Saban | 1977 | 90 | 8.7 | 18534 |
| Zügürt Aga | 1985 | 101 | 8.7 | 16135 |
| Neseli Günler | 1978 | 95 | 8.7 | 11807 |
| Kibar Feyzo | 1978 | 83 | 8.7 | 17126 |
| Hababam Sinifi Uyaniyor | 1976 | 94 | 8.7 | 20640 |
| Canim Kardesim | 1973 | 85 | 8.6 | 10097 |