Assignment 2

(1) Getting URL

Code

url <- c("https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&sort=release_date,asc&num_votes=2500,&country_of_origin=TR&count=250","https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&sort=release_date,asc&num_votes=2500,&country_of_origin=TR&count=250")

(2) Preparing of DataFrame

Code

library(tidyverse)
library(rvest)
library(stringr)
library(knitr)


convert_time <- function(time_str) {
  hours <- 0
  minutes <- 0
  
  if (grepl("h", time_str)) {
    time_components <- strsplit(time_str, "h|m")[[1]]
    if (length(time_components) >= 1) {
      hours <- as.numeric(time_components[1])
    }
    if (length(time_components) >= 2) {
      minutes <- as.numeric(time_components[2])
    }
  } else {
    minutes <- as.numeric(gsub("m", "", time_str))
  }
  
  total_minutes <- hours * 60 + minutes
  return(total_minutes)
}
#PART URL1

data_html <- read_html(url[1])
title_names1 <- data_html |> html_nodes('.ipc-title__text')
title_names1 <- html_text(title_names1)
title_names1 <- tail(head(title_names1,-1),-1)
title_names1 <- str_split(title_names1, " ", n=2)
title_names1 <- unlist(lapply(title_names1, function(x) {x[2]}))

year1 <- data_html |> html_nodes(".dli-title-metadata-item:nth-child(1)")
year1 <- as.numeric(html_text(year1))

duration1 <- data_html |> html_nodes((".dli-title-metadata-item:nth-child(2)"))
duration1 <- html_text(duration1)
duration1 <- unlist(lapply(duration1, convert_time))

rating1 <- data_html |> html_nodes(".ratingGroup--imdb-rating")
rating1 <- html_text(rating1)
rating1 <- as.numeric(str_extract(rating1, "\\d+\\.\\d+"))

votes1 <- data_html |> html_nodes(".kRnqtn")
votes1 <- html_text(votes1)
votes1 <- as.numeric(gsub("\\D", "", votes1))

#PART URL2

data_html <- read_html(url[2])
title_names2 <- data_html |> html_nodes('.ipc-title__text')
title_names2 <- html_text(title_names2)
title_names2 <- tail(head(title_names2,-1),-1)
title_names2 <- str_split(title_names2, " ", n=2)
title_names2 <- unlist(lapply(title_names2, function(x) {x[2]}))

year2 <- data_html |> html_nodes(".dli-title-metadata-item:nth-child(1)")
year2 <- as.numeric(html_text(year2))

duration2 <- data_html |> html_nodes((".dli-title-metadata-item:nth-child(2)"))
duration2 <- html_text(duration2)
duration2 <- unlist(lapply(duration2, convert_time))

rating2 <- data_html |> html_nodes(".ratingGroup--imdb-rating")
rating2 <- html_text(rating2)
rating2 <- as.numeric(str_extract(rating2, "\\d+\\.\\d+"))

votes2 <- data_html |> html_nodes(".kRnqtn")
votes2 <- html_text(votes2)
votes2 <- as.numeric(gsub("\\D", "", votes2))

Title <- c(title_names1,title_names2)
Year <- c(year1,year2)
Duration <- c(duration1,duration2)
Rating <- c(rating1,rating2)
Votes <- c(votes1,votes2)

movie_data <- data.frame(
  Title = Title,
  Year = Year,
  Duration = Duration,
  Rating = Rating,
  Votes = Votes
)

(3) Analytics of Movies

Part A

Code

movies <- movie_data %>% 
  arrange(desc(Rating))

Top 5 movies based on ratings are shown below.

I think it wouldn’t be wrong to say that whatever exists exists in the past.

Code

kable(
  movies %>%
    head(5),
  caption = "Top 5 Movies",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)

Top 5 Movies
Title	Year	Duration	Rating	Votes
Hababam Sinifi	1975	87	9.2	42520
CM101MMXI Fundamentals	2013	139	9.1	47001
Hababam Sinifi Sinifta Kaldi	1975	95	8.9	24372
Tosun Pasa	1976	90	8.9	24331
Süt Kardesler	1976	80	8.8	20891

The bottom 5 is shown below.

The new ones are obvious

Code

kable(
  movies %>%
    tail(5),
  caption = "Bottom 5 Movies",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)

Bottom 5 Movies
	Title	Year	Duration	Rating	Votes
466	Cumali Ceber 2	2018	100	1.2	10229
467	15/07 Safak Vakti	2021	95	1.2	20608
468	Müjde	2022	48	1.2	9920
469	Reis	2017	108	1.0	73974
470	Cumali Ceber: Allah Seni Alsin	2017	100	1.0	39268

Part B

Code

kable(
  movie_data %>%
    filter(Title == "Ölümlü Dünya" | Title == "Yedinci Kogustaki Mucize" | Title == "Dag II"),
  caption = "My Best Three",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)

My Best Three
Title	Year	Duration	Rating	Votes
Dag II	2016	135	8.2	109882
Ölümlü Dünya	2018	107	7.6	30296
Yedinci Kogustaki Mucize	2019	132	8.2	54200

Part C

Code

movie_data %>% 
  group_by(Year) %>%
  summarize(yearly_average = mean(Rating)) %>%
  ggplot(aes(x = Year, y = yearly_average)) + geom_point() +
  ggtitle("Yearly Rating Averages")

The plot clearly show that ratings decrease over the years.

Code

ggplot(movie_data, aes(x = factor(Year))) +
  geom_bar(fill = "blue", color = "black") +
  labs(x = "Year", y = "Number of Movies") +
  ggtitle("Number of Movies Over the Years") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90)
  )

Code

movie_data$Year <- as.factor(movie_data$Year)
ggplot(movie_data, aes(x = Year, y = Rating, fill = factor(Year))) +
  geom_boxplot(color = "black",width = 0.5) +
  labs(x = "Year", y = "Rating") +
  ggtitle("Box Plots of Movie Ratings Over the Years") +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90)
  )

Part D

Code

corr <- cor(movie_data$Votes, movie_data$Rating)
cat("Correlation between Votes and Ratings:", corr)

Correlation between Votes and Ratings: 0.1311088

The correlation between votes and ratings is weak.

Part E

Code

corr <- cor(movie_data$Duration, movie_data$Rating)
cat("Correlation between Votes and Ratings:", corr)

Correlation between Votes and Ratings: 0.03331057

There is no correlation between duration and ratings.

(4) Turkish Movies in Top 1000

Code

movie_data1 = movie_data
url <- c("https://m.imdb.com/search/title/?title_type=feature&sort=release_date,asc&num_votes=2500,&groups=top_1000&country_of_origin=TR&count=250")
data_html <- read_html(url)
title_names <- data_html |> html_nodes('.ipc-title__text')
title_names <- html_text(title_names)
title_names <- tail(head(title_names,-1),-1)
title_names <- str_split(title_names, " ", n=2)
title_names <- unlist(lapply(title_names, function(x) {x[2]}))

year <- data_html |> html_nodes(".dli-title-metadata-item:nth-child(1)")
year <- as.numeric(html_text(year))

top_movies <- data.frame(
  Title = title_names,
  Year = year
  )
kable (
  top_movies,
  caption = "Turkish Movies in the Top 1000",
)

Turkish Movies in the Top 1000
Title	Year
Eskiya	1996
Her Sey Çok Güzel Olacak	1998
Vizontele	2001
G.O.R.A.	2004
Babam ve Oglum	2005
Nefes: Vatan Sagolsun	2009
Bir Zamanlar Anadolu’da	2011
Kis Uykusu	2014
Ayla: The Daughter of War	2017
Ahlat Agaci	2018
Yedinci Kogustaki Mucize	2019

Code

movie_data <- movie_data %>% mutate(Year = as.numeric(Year))
top_movies <- top_movies %>% mutate(Year = as.numeric(Year))

movies_top2 <- movies %>%
  inner_join(top_movies, by = c("Title", "Year")) %>%
  arrange(desc(Rating))
kable(
  movies_top2,
  caption = "Turkish Movies in the Top 1000 Full Table",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)

Turkish Movies in the Top 1000 Full Table
Title	Year	Duration	Rating	Votes
Ayla: The Daughter of War	2017	125	8.3	43005
Yedinci Kogustaki Mucize	2019	132	8.2	54200
Babam ve Oglum	2005	108	8.2	91054
Eskiya	1996	128	8.1	71707
Her Sey Çok Güzel Olacak	1998	107	8.1	27129
Kis Uykusu	2014	196	8.0	54664
Ahlat Agaci	2018	188	8.0	27029
Vizontele	2001	110	8.0	38412
G.O.R.A.	2004	127	8.0	66042
Nefes: Vatan Sagolsun	2009	128	8.0	35032
Bir Zamanlar Anadolu’da	2011	157	7.8	49380

Code

movie_data1 <- arrange(movie_data1, desc(Rating))

kable(
  head(movie_data1, 11),
  caption = "Turkish Movies in the Top 11",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)

Turkish Movies in the Top 11
Title	Year	Duration	Rating	Votes
Hababam Sinifi	1975	87	9.2	42520
CM101MMXI Fundamentals	2013	139	9.1	47001
Hababam Sinifi Sinifta Kaldi	1975	95	8.9	24372
Tosun Pasa	1976	90	8.9	24331
Süt Kardesler	1976	80	8.8	20891
Hababam Sinifi Uyaniyor	1976	94	8.7	20642
Saban Oglu Saban	1977	90	8.7	18537
Neseli Günler	1978	95	8.7	11811
Kibar Feyzo	1978	83	8.7	17127
Zügürt Aga	1985	101	8.7	16141
Canim Kardesim	1973	85	8.6	10099

Upon examining the table, it becomes apparent that the initial 11 highest-rated movies differ from those listed in our original data frame. A notable distinction lies in the release years of these films, with a conspicuous absence of older productions in the TOP 1000 IMDB list. It is probable that the inclusion criteria for this list prioritize movies released after a specific year.