(1)

Code
# IMDb search-result URLs. Both queries ask for feature films whose country of
# origin is TR with num_votes starting at 2500, sorted by ascending release
# date, 250 results per page. They differ only in the release-date window.

# Release dates from 2010-01-01 through 2023-12-31.
url1 <- paste0(
  "https://m.imdb.com/search/title/",
  "?title_type=feature",
  "&release_date=2010-01-01,2023-12-31",
  "&sort=release_date,asc",
  "&num_votes=2500,",
  "&country_of_origin=TR",
  "&count=250"
)

# Release dates up to and including 2009-12-31.
url2 <- paste0(
  "https://m.imdb.com/search/title/",
  "?title_type=feature",
  "&release_date=,2009-12-31",
  "&sort=release_date,asc",
  "&num_votes=2500,",
  "&country_of_origin=TR",
  "&count=250"
)

(2)

Code
library(tidyverse)
library(rvest)
library(stringr)
library(knitr)


#' Convert runtime strings such as "1h 30m", "2h" or "45m" to total minutes.
#'
#' Vectorised: accepts a character vector of any length (including length
#' zero) and returns a numeric vector of the same length.
#'
#' @param time_str Character vector of runtime strings.
#' @return Numeric vector of total minutes. `NA` input gives `NA`; text that
#'   cannot be parsed as a number also gives `NA` (with R's usual coercion
#'   warning from `as.numeric()`).
convert_time <- function(time_str) {
  # Parse a single runtime string; kept scalar so `if` sees one condition.
  parse_one <- function(value) {
    if (is.na(value)) {
      return(NA_real_)
    }

    if (grepl("h", value, fixed = TRUE)) {
      # "1h 30m" splits into c("1", " 30"); "2h" splits into c("2").
      # as.numeric() tolerates the leading blank in " 30".
      parts <- strsplit(value, "h|m")[[1]]
      hours <- as.numeric(parts[1])
      minutes <- if (length(parts) >= 2) as.numeric(parts[2]) else 0
    } else {
      # No hour component: the whole string is minutes, e.g. "45m".
      hours <- 0
      minutes <- as.numeric(gsub("m", "", value, fixed = TRUE))
    }

    hours * 60 + minutes
  }

  # vapply() guarantees a numeric result even for zero-length input.
  vapply(as.character(time_str), parse_one, numeric(1), USE.NAMES = FALSE)
}

# Download the first results page (url1); every column below is read from it.
data_html <- read_html(url1)

# Titles: take the text of every ".ipc-title__text" element, drop the last
# and the first match, then keep only what follows the first space in each
# entry (presumably stripping a leading rank such as "1." -- verify on page).
title_names1 <- data_html %>%
  html_elements(".ipc-title__text") %>%
  html_text() %>%
  head(-1) %>%
  tail(-1) %>%
  str_split(" ", n = 2) %>%
  lapply(function(parts) parts[2]) %>%
  unlist()

# Year: first metadata item of each entry, coerced to numeric.
year1 <- as.numeric(
  html_text(
    html_elements(data_html, ".dli-title-metadata-item:nth-child(1)")
  )
)

# Duration: second metadata item of each entry, converted to minutes.
duration1 <- data_html %>%
  html_elements(".dli-title-metadata-item:nth-child(2)") %>%
  html_text() %>%
  lapply(convert_time) %>%
  unlist()

# Rating: first decimal number found in the rating group text.
rating1 <- data_html %>%
  html_elements(".ratingGroup--imdb-rating") %>%
  html_text() %>%
  str_extract("\\d+\\.\\d+") %>%
  as.numeric()

# Votes: strip every non-digit character, then coerce to numeric.
votes1 <- data_html %>%
  html_elements(".kRnqtn") %>%
  html_text() %>%
  gsub(pattern = "\\D", replacement = "", x = .) %>%
  as.numeric()

#PART URL2

# Download the second results page (url2); note this overwrites data_html.
data_html <- read_html(url2)

# Titles: take the text of every ".ipc-title__text" element, drop the last
# and the first match, then keep only what follows the first space in each
# entry (presumably stripping a leading rank such as "1." -- verify on page).
title_names2 <- data_html %>%
  html_elements(".ipc-title__text") %>%
  html_text() %>%
  head(-1) %>%
  tail(-1) %>%
  str_split(" ", n = 2) %>%
  lapply(function(parts) parts[2]) %>%
  unlist()

# Year: first metadata item of each entry, coerced to numeric.
year2 <- as.numeric(
  html_text(
    html_elements(data_html, ".dli-title-metadata-item:nth-child(1)")
  )
)

# Duration: second metadata item of each entry, converted to minutes.
duration2 <- data_html %>%
  html_elements(".dli-title-metadata-item:nth-child(2)") %>%
  html_text() %>%
  lapply(convert_time) %>%
  unlist()

# Rating: first decimal number found in the rating group text.
rating2 <- data_html %>%
  html_elements(".ratingGroup--imdb-rating") %>%
  html_text() %>%
  str_extract("\\d+\\.\\d+") %>%
  as.numeric()

# Votes: strip every non-digit character, then coerce to numeric.
votes2 <- data_html %>%
  html_elements(".kRnqtn") %>%
  html_text() %>%
  gsub(pattern = "\\D", replacement = "", x = .) %>%
  as.numeric()

# Concatenate the per-page vectors, url1 results first and url2 results after.
votes <- c(votes1, votes2)
rating <- c(rating1, rating2)
duration <- c(duration1, duration2)
year <- c(year1, year2)
title <- c(title_names1, title_names2)

# One row per movie; data.frame() stops with an error if the column vectors
# cannot be aligned to a common number of rows.
moviedf <- data.frame(
  Title    = title,
  Year     = year,
  Duration = duration,
  Rating   = rating,
  Votes    = votes
)

(3)

Part A

Code
# Copy of the full data set ordered from the highest to the lowest rating.
movies <- arrange(moviedf, desc(Rating))
Code
# Render the five highest-rated movies as an HTML table.
# The sort key must be the `Rating` column (capital R). A lower-case `rating`
# is not a column of `movies`; dplyr would silently fall back to the global
# `rating` vector built during scraping and order the rows by values that no
# longer line up with them.
kable(
  movies %>%
    arrange(desc(Rating)) %>%
    head(5),
  format = "html",
  caption = "Top 5 Movies based on Rating",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Top 5 Movies based on Rating
Title Year Duration Rating Votes
Hükümet Kadin 2013 99 6.5 9184
Arabesk 1989 92 8.0 8303
Senden Bana Kalan 2015 117 6.5 3403
Yolun Açik Olsun 2022 119 6.5 2699
Aykut Eniste 2 2021 115 6.5 3597
Code
# Last five rows of `movies` (already ordered by descending rating), i.e. the
# five lowest-rated movies.
kable(
  tail(movies, 5),
  caption = "Bottom 5",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Bottom 5
Title Year Duration Rating Votes
466 Cumali Ceber 2 2018 100 1.2 10229
467 15/07 Safak Vakti 2021 95 1.2 20608
468 Müjde 2022 48 1.2 9920
469 Reis 2017 108 1.0 73974
470 Cumali Ceber: Allah Seni Alsin 2017 100 1.0 39268

Part B

Code
# Table of three hand-picked titles, matched on their exact Title text.
kable(
  filter(
    moviedf,
    Title %in% c(
      "Recep Ivedik 3",
      "Eyyvah Eyvah",
      "Çok Filim Hareketler Bunlar"
    )
  ),
  caption = "My Bests",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
My Bests
Title Year Duration Rating Votes
Recep Ivedik 3 2010 95 4.2 20792
Eyyvah Eyvah 2010 104 7.0 21447
Çok Filim Hareketler Bunlar 2010 104 4.5 5161

Part C

Code
# Scatter plot of the mean rating per release year (one point per year).
moviedf %>%
  group_by(Year) %>%
  summarise(avg = mean(Rating)) %>%
  ggplot(mapping = aes(x = Year, y = avg)) +
  geom_point() +
  labs(title = "Yearly Rating Averages")

The plot clearly shows that the yearly average ratings decrease over the years.

Code
# Bar chart counting movies per release year; Year is treated as a discrete
# axis and the tick labels are rotated so they do not overlap.
ggplot(data = moviedf, mapping = aes(x = factor(Year))) +
  geom_bar(colour = "black", fill = "yellow") +
  labs(
    title = "Number of Movies Over the Years",
    x = "Year",
    y = "Number of Movies"
  ) +
  theme(
    axis.text.x = element_text(angle = 60),
    legend.position = "none"
  )

Code
# NOTE: this permanently turns moviedf$Year into a factor; any later code that
# needs the numeric year has to convert it back explicitly.
moviedf <- moviedf %>%
  mutate(Year = as.factor(Year))

# One box per release year showing the distribution of ratings; the fill
# legend is hidden and the tick labels are rotated.
ggplot(data = moviedf, mapping = aes(x = Year, y = Rating, fill = factor(Year))) +
  geom_boxplot(width = 0.7, colour = "black") +
  labs(title = "Box Plots of Movie Ratings", x = "Year", y = "Rating") +
  theme(
    axis.text.x = element_text(angle = 60),
    legend.position = "none"
  )

Part D

Code
# Correlation between vote count and rating (cor()'s default method).
corr <- with(moviedf, cor(Votes, Rating))
cat("Correlation between Votes and Ratings:", corr)
Correlation between Votes and Ratings: 0.1311088

The correlation between votes and ratings is about 0.13, i.e. only a weak positive linear relationship.

Part E

Code
# Correlation between running time (minutes) and rating (cor()'s default
# method). The printed label names Duration so it matches what is computed.
corr <- cor(moviedf$Duration, moviedf$Rating)
cat("Correlation between Duration and Ratings:", corr)
Correlation between Votes and Ratings: 0.03331057

The correlation between duration and ratings is about 0.03, i.e. there is essentially no linear relationship.

(4) Turkish Movies in Top 1000

Code
# Snapshot of moviedf as it stands at this point, used for the final table.
moviedf1 <- moviedf

# Same kind of search as before, additionally restricted to the "top_1000"
# group and without a release-date window.
url <- paste0(
  "https://m.imdb.com/search/title/",
  "?title_type=feature",
  "&sort=release_date,asc",
  "&num_votes=2500,",
  "&groups=top_1000",
  "&country_of_origin=TR",
  "&count=250"
)

data_html <- read_html(url)

# Titles: drop the last and the first matched element, then keep only the
# text after the first space of each remaining entry.
title_names <- data_html %>%
  html_elements(".ipc-title__text") %>%
  html_text() %>%
  head(-1) %>%
  tail(-1) %>%
  str_split(" ", n = 2) %>%
  lapply(function(parts) parts[2]) %>%
  unlist()

# Year: first metadata item of each entry, coerced to numeric.
# This overwrites the global `year` vector built for moviedf.
year <- as.numeric(
  html_text(
    html_elements(data_html, ".dli-title-metadata-item:nth-child(1)")
  )
)

bestmovies <- data.frame(Title = title_names, Year = year)

kable(
  bestmovies,
  caption = "Turkish Movies in the Top 1000",
  format = "simple"
)
Turkish Movies in the Top 1000
Title Year
Eskiya 1996
Her Sey Çok Güzel Olacak 1998
Vizontele 2001
G.O.R.A. 2004
Babam ve Oglum 2005
Nefes: Vatan Sagolsun 2009
Bir Zamanlar Anadolu’da 2011
Kis Uykusu 2014
Ayla: The Daughter of War 2017
Ahlat Agaci 2018
Yedinci Kogustaki Mucize 2019
Code
# moviedf$Year was converted to a factor for the box plot earlier in the
# script. as.numeric() on a factor returns the internal level codes (1, 2, ...)
# rather than the years, so go through as.character() first. The same
# expression is harmless when Year is already numeric.
moviedf <- moviedf %>%
  mutate(Year = as.numeric(as.character(Year)))
bestmovies <- bestmovies %>%
  mutate(Year = as.numeric(as.character(Year)))

# Attach Duration, Rating and Votes to the Top-1000 titles by matching on both
# Title and Year, then order by descending rating.
bestmovies2 <- movies %>%
  inner_join(bestmovies, by = c("Title", "Year")) %>%
  arrange(desc(Rating))
kable(
  bestmovies2,
  caption = "Turkish Movies in the Top 1000 Full Table",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Turkish Movies in the Top 1000 Full Table
Title Year Duration Rating Votes
Ayla: The Daughter of War 2017 125 8.3 43005
Yedinci Kogustaki Mucize 2019 132 8.2 54200
Babam ve Oglum 2005 108 8.2 91054
Eskiya 1996 128 8.1 71707
Her Sey Çok Güzel Olacak 1998 107 8.1 27129
Kis Uykusu 2014 196 8.0 54664
Ahlat Agaci 2018 188 8.0 27029
Vizontele 2001 110 8.0 38412
G.O.R.A. 2004 127 8.0 66042
Nefes: Vatan Sagolsun 2009 128 8.0 35032
Bir Zamanlar Anadolu’da 2011 157 7.8 49380
Code
# Order the snapshot by descending rating and show its first 11 rows, for
# comparison with the 11 Top-1000 titles listed above.
moviedf1 <- moviedf1 %>%
  arrange(desc(Rating))

kable(
  moviedf1 %>% head(11),
  caption = "Turkish Movies in the Top 11",
  col.names = c("Title", "Year", "Duration", "Rating", "Votes")
)
Turkish Movies in the Top 11
Title Year Duration Rating Votes
Hababam Sinifi 1975 87 9.2 42520
CM101MMXI Fundamentals 2013 139 9.1 47001
Hababam Sinifi Sinifta Kaldi 1975 95 8.9 24372
Tosun Pasa 1976 90 8.9 24331
Süt Kardesler 1976 80 8.8 20891
Hababam Sinifi Uyaniyor 1976 94 8.7 20642
Saban Oglu Saban 1977 90 8.7 18537
Neseli Günler 1978 95 8.7 11811
Kibar Feyzo 1978 83 8.7 17127
Zügürt Aga 1985 101 8.7 16141
Canim Kardesim 1973 85 8.6 10099

Comparing the two tables shows that the 11 highest-rated movies in our original data frame differ from the Turkish movies that appear in the IMDb Top 1000 list. The release years contrast sharply: our highest-rated titles are mostly productions from the 1970s and 1980s, whereas the Top 1000 list conspicuously lacks such older films. It is likely that the criteria for inclusion in this list prioritize movies released after a certain year.

Back to top