Assignment 2

Assignment 2

Import Libraries and Data

Code
library(tidyverse)
library(rvest)
library(stringr)
library(reshape2)
library(ggplot2)
library(knitr)
library(httr)
library(kableExtra)

# IMDb advanced-search query: feature films originating from Turkey with at
# least 2500 votes, 250 results per page.
base_url <- "https://www.imdb.com/search/title/?title_type=feature&num_votes=2500,&country_of_origin=TR&count=250"

# The search is split into two release-date windows (2010-2023 and everything
# up to the end of 2009); each window is requested as its own page.
release_date_filters <- c(
  "&release_date=2010-01-01,2023-12-31",
  "&release_date=,2009-12-31"
)
url_vector <- paste0(base_url, release_date_filters)

Data Cleaning and Creating The Dataframe

Code
# Convert IMDb list-item metadata strings into runtimes in minutes.
#
# Each metadata string starts with the four-character release year, followed
# directly by the runtime written as "<H>h <M>m", "<H>h" or "<M>m", and then
# by further text (e.g. a certificate) that must not be mistaken for runtime.
#
# @param metadata Character vector of metadata strings.
# @return Numeric vector, same length as `metadata`: total minutes, or NA when
#   no runtime is present (rather than a misleading 0).
parse_duration_minutes <- function(metadata) {
  after_year <- substr(metadata, 5, nchar(metadata))

  # Anchor both patterns at the start so that an "h" or "m" occurring later in
  # the string (outside the runtime) can never be picked up.
  hours <- as.numeric(str_match(after_year, "^\\s*(\\d+)h")[, 2])
  minutes <- as.numeric(
    str_match(after_year, "^\\s*(?:\\d+h\\s?)?(\\d+)m")[, 2]
  )

  runtime_missing <- is.na(hours) & is.na(minutes)
  hours[is.na(hours)] <- 0
  minutes[is.na(minutes)] <- 0

  total <- hours * 60 + minutes
  total[runtime_missing] <- NA_real_
  total
}

# Scrape one IMDb search-result page into a data frame of movies.
#
# @param url URL of an IMDb advanced-search result page.
# @return data.frame with columns movie_titles, movie_years, movie_durations,
#   movie_ratings and movie_votes (one row per movie). data.frame() raises an
#   error if the scraped fields do not line up, instead of silently
#   misaligning columns across pages.
scrape_movie_page <- function(url) {
  page <- read_html(url)

  title_text <- page %>%
    html_nodes(".ipc-title__text") %>%
    html_text()
  # The first and last matches are dropped, as they are not movie entries
  # (presumably page-level headings that share the CSS class).
  title_text <- tail(head(title_text, -1), -1)
  # Each entry is prefixed with its list position; keep what follows the
  # first space.
  titles <- vapply(
    str_split(title_text, " ", n = 2),
    function(parts) parts[2],
    character(1)
  )

  # NOTE(review): the "sc-..." class names look auto-generated and may change
  # when IMDb redeploys; verify the selectors if the scrape returns nothing.
  metadata <- page %>%
    html_nodes(".sc-43986a27-7.dBkaPT.dli-title-metadata") %>%
    html_text()

  rating_text <- page %>%
    html_nodes(".ipc-rating-star.ipc-rating-star--base.ipc-rating-star--imdb.ratingGroup--imdb-rating") %>%
    html_text()

  vote_text <- page %>%
    html_nodes(".sc-53c98e73-0.kRnqtn") %>%
    html_text()
  vote_text <- sub("Votes", "", vote_text)
  # gsub (not sub): counts of a million or more contain several thousands
  # separators, and a leftover comma would turn the value into NA.
  vote_text <- gsub(",", "", vote_text, fixed = TRUE)

  data.frame(
    movie_titles = titles,
    movie_years = as.numeric(substr(metadata, 1, 4)),
    movie_durations = parse_duration_minutes(metadata),
    movie_ratings = as.numeric(substr(rating_text, 1, 3)),
    movie_votes = as.numeric(vote_text),
    stringsAsFactors = FALSE
  )
}

# Scrape every result page and stack the per-page tables in URL order.
movies_df <- bind_rows(lapply(url_vector, scrape_movie_page))
kable(head(movies_df, 10), caption = "Movies Dataframe")
Movies Dataframe
movie_titles movie_years movie_durations movie_ratings movie_votes
Kuru Otlar Üstüne 2023 197 8.1 5081
Istanbul Için Son Çagri 2023 91 5.3 7376
Yedinci Kogustaki Mucize 2019 132 8.2 54161
Ölümlü Dünya 2 2023 117 7.5 3472
Bihter 2023 113 3.6 3350
Ölümlü Dünya 2018 107 7.6 30267
Kis Uykusu 2014 196 8.0 54642
Dag II 2016 135 8.2 109866
Do Not Disturb 2023 114 6.3 8773
Ayla: The Daughter of War 2017 125 8.3 42991

Examine the Structure of ‘movies_df’

Code
# Compact overview of the scraped table: row count plus each column's type
str(object = movies_df)
'data.frame':   470 obs. of  5 variables:
 $ movie_titles   : chr  "Kuru Otlar Üstüne" "Istanbul Için Son Çagri" "Yedinci Kogustaki Mucize" "Ölümlü Dünya 2" ...
 $ movie_years    : num  2023 2023 2019 2023 2023 ...
 $ movie_durations: num  197 91 132 117 113 107 196 135 114 125 ...
 $ movie_ratings  : num  8.1 5.3 8.2 7.5 3.6 7.6 8 8.2 6.3 8.3 ...
 $ movie_votes    : num  5081 7376 54161 3472 3350 ...

Top 5 movies

Code
# Row order from highest to lowest rating; keep the five leading rows.
# Base subsetting is used so the original row numbers stay visible.
rows_best_first <- order(movies_df$movie_ratings, decreasing = TRUE)
top5_movies <- head(movies_df[rows_best_first, ], n = 5)
top5_movies
                    movie_titles movie_years movie_durations movie_ratings
257               Hababam Sinifi        1975              87           9.2
39        CM101MMXI Fundamentals        2013             139           9.1
273                   Tosun Pasa        1976              90           8.9
337 Hababam Sinifi Sinifta Kaldi        1975              95           8.9
321                Süt Kardesler        1976              80           8.8
    movie_votes
257       42513
39        46996
273       24329
337       24370
321       20888

To begin with, I have seen all of them except “Süt Kardeşler”, and I think the ratings of the “Hababam Class” film series and “Tosun Paşa” are very accurate. “Hababam Class” was a heartfelt and genuine film series, particularly in the way it represented the cultural norms of its time and in its lovable characters. As for “Fundamentals”, I think it was among Cem Yılmaz’s greatest pieces, but after seeing some of his more recent work I no longer feel a strong connection to the earlier ones. For this reason, I disagree that it should be in second place.

Worst 5 movies

Code
# Row order from lowest to highest rating; keep the five leading rows.
rows_worst_first <- order(movies_df$movie_ratings)
bottom5_movies <- head(movies_df[rows_worst_first, ], n = 5)
bottom5_movies
                      movie_titles movie_years movie_durations movie_ratings
101 Cumali Ceber: Allah Seni Alsin        2017             100           1.0
150                           Reis        2017             108           1.0
189                 Cumali Ceber 2        2018             100           1.2
199                          Müjde        2022              48           1.2
245              15/07 Safak Vakti        2021              95           1.2
    movie_votes
101       39267
150       73973
189       10229
199        9920
245       20608

In fact, I haven’t watched any of them, and some of them I’ve never even heard of. However, even if I have to push myself, I’m considering seeing “Cumali Ceber: May God Take You” after this assignment is done :))

My best movies

Code
# Pick out three personal favourites by exact title match
favourite_titles <- c("Gemide", "Av Mevsimi", "Kaybedenler Kulübü")
is_favourite <- movies_df$movie_titles %in% favourite_titles
my_movies <- movies_df[is_favourite, ]
my_movies
          movie_titles movie_years movie_durations movie_ratings movie_votes
48  Kaybedenler Kulübü        2011             105           7.5       25128
56          Av Mevsimi        2010             140           7.4       36262
278             Gemide        1998             102           7.9       15722

This table leads me to believe that I don’t watch movies with ratings higher than eight. :)

Plots

Scatter Plot of Average Ratings of Movies Over the Years

Code
# Store the release year as a factor so each year is a discrete category
# (this modifies movies_df itself, not a copy).
movies_df[["movie_years"]] <- as.factor(movies_df[["movie_years"]])

# Per-year mean rating together with the number of movies in that year
rating_avg_by_year <- summarise(
  group_by(movies_df, movie_years),
  avg_rating = mean(movie_ratings),
  num_movies = n()
)

# One point per year; the year labels are turned vertical
ggplot(rating_avg_by_year, aes(x = movie_years, y = avg_rating)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Scatter Plot of Average Ratings of Turkish Movies Over the Years",
    x = "Year",
    y = "Average Rating"
  )

Box Plot of Ratings of Movies Over the Years

Code
# Distribution of individual movie ratings within each release year
ggplot(movies_df, aes(x = movie_years, y = movie_ratings)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Ratings of Turkish Movies Over the Years",
    x = "Year",
    y = "Rating"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Correlation between Votes and Ratings

Code
# Each point is one movie: vote count on the x-axis, rating on the y-axis
ggplot(data = movies_df, mapping = aes(x = movie_votes, y = movie_ratings)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Votes vs Ratings",
    x = "Number of Votes",
    y = "Ratings"
  )

Numerical Representation of the Above Graph (Correlation)

Code
# Correlation of vote count with rating, computed on rows where both values
# are present
correlation <- with(
  movies_df,
  cor(movie_votes, movie_ratings, use = "complete.obs")
)

cat("Correlation between Votes and Ratings:", correlation, "\n")
Correlation between Votes and Ratings: 0.1307806 

Correlation between Duration and Ratings

Code
# Correlation of runtime with rating, computed on rows where both values are
# present
correlation_duration_rating <- with(
  movies_df,
  cor(movie_durations, movie_ratings, use = "complete.obs")
)
cat("Correlation between Duration and Ratings:", correlation_duration_rating, "\n")
Correlation between Duration and Ratings: 0.03343216 
Code
# Each point is one movie: runtime on the x-axis, rating on the y-axis
ggplot(data = movies_df, mapping = aes(x = movie_durations, y = movie_ratings)) +
  geom_point() +
  labs(
    title = "Scatter Plot of Duration vs Ratings",
    x = "Duration (minutes)",
    y = "Ratings"
  )

Numerical Representation of the Above Graph (Correlation)

Code
# Same runtime-rating correlation as a single number, printed under the plot
correlation_2 <- with(
  movies_df,
  cor(movie_durations, movie_ratings, use = "complete.obs")
)

cat("Correlation between Duration and Ratings:", correlation_2, "\n")
Correlation between Duration and Ratings: 0.03343216 

Turkish Movies in IMDb Top 1000

Code
# Search page restricted to IMDb's "top_1000" group, Turkish feature films
URL_3 <- "https://www.imdb.com/search/title/?title_type=feature&groups=top_1000&country_of_origin=TR&count=250"

HTML <- read_html(URL_3)

# Title text: drop the first and last matches, then keep whatever follows the
# first space of each remaining entry.
title_names <- HTML %>%
  html_nodes(".ipc-title__text") %>%
  html_text() %>%
  head(-1) %>%
  tail(-1) %>%
  str_split(" ", n = 2) %>%
  lapply(function(parts) parts[2]) %>%
  unlist()

# The first four characters of each metadata string are read as the year
year <- HTML %>%
  html_nodes(".sc-43986a27-7.dBkaPT.dli-title-metadata") %>%
  html_text() %>%
  substr(1, 4) %>%
  as.numeric()

movie_name <- title_names
movie_year <- year

top1000_df <- data.frame(movie_name, movie_year)

kable_styling(kable(top1000_df), full_width = FALSE)
movie_name movie_year
Yedinci Kogustaki Mucize 2019
Kis Uykusu 2014
Nefes: Vatan Sagolsun 2009
Ayla: The Daughter of War 2017
Babam ve Oglum 2005
Ahlat Agaci 2018
Bir Zamanlar Anadolu'da 2011
Eskiya 1996
G.O.R.A. 2004
Vizontele 2001
Her Sey Çok Güzel Olacak 1998

Merging the Dataframe to Expand the other Columns

Code
# Left join: keep every top-1000 entry and attach duration, rating and votes
# from movies_df wherever both title and year match.
top1000_keys <- c("movie_name", "movie_year")
movies_keys <- c("movie_titles", "movie_years")

top1000_df_merged <- merge(
  top1000_df,
  movies_df,
  by.x = top1000_keys,
  by.y = movies_keys,
  all.x = TRUE
)

kable_styling(kable(top1000_df_merged), full_width = FALSE)
movie_name movie_year movie_durations movie_ratings movie_votes
Ahlat Agaci 2018 188 8.0 27011
Ayla: The Daughter of War 2017 125 8.3 42991
Babam ve Oglum 2005 108 8.2 91035
Bir Zamanlar Anadolu'da 2011 157 7.8 49359
Eskiya 1996 128 8.1 71703
G.O.R.A. 2004 127 8.0 66032
Her Sey Çok Güzel Olacak 1998 107 8.1 27122
Kis Uykusu 2014 196 8.0 54642
Nefes: Vatan Sagolsun 2009 128 8.0 35020
Vizontele 2001 110 8.0 38402
Yedinci Kogustaki Mucize 2019 132 8.2 54161

Ordered by Rating

Code
# Re-sort the merged table from highest to lowest rating
rows_by_rating <- order(top1000_df_merged$movie_ratings, decreasing = TRUE)
top1000_df_merged <- top1000_df_merged[rows_by_rating, ]

kable_styling(kable(top1000_df_merged), full_width = FALSE)
movie_name movie_year movie_durations movie_ratings movie_votes
2 Ayla: The Daughter of War 2017 125 8.3 42991
3 Babam ve Oglum 2005 108 8.2 91035
11 Yedinci Kogustaki Mucize 2019 132 8.2 54161
5 Eskiya 1996 128 8.1 71703
7 Her Sey Çok Güzel Olacak 1998 107 8.1 27122
1 Ahlat Agaci 2018 188 8.0 27011
6 G.O.R.A. 2004 127 8.0 66032
8 Kis Uykusu 2014 196 8.0 54642
9 Nefes: Vatan Sagolsun 2009 128 8.0 35020
10 Vizontele 2001 110 8.0 38402
4 Bir Zamanlar Anadolu'da 2011 157 7.8 49359

First 11 Movies of the Movies Dataframe, Sorted by Rating

Code
# The eleven highest-rated movies of the full table, for comparison with the
# eleven top-1000 entries
first11_by_rating <- head(arrange(movies_df, desc(movie_ratings)), 11)

first11_by_rating %>%
  kable(caption = "First 11 Movies Sorted by Rating") %>%
  kable_styling(full_width = FALSE)
First 11 Movies Sorted by Rating
movie_titles movie_years movie_durations movie_ratings movie_votes
Hababam Sinifi 1975 87 9.2 42513
CM101MMXI Fundamentals 2013 139 9.1 46996
Tosun Pasa 1976 90 8.9 24329
Hababam Sinifi Sinifta Kaldi 1975 95 8.9 24370
Süt Kardesler 1976 80 8.8 20888
Saban Oglu Saban 1977 90 8.7 18535
Zügürt Aga 1985 101 8.7 16135
Neseli Günler 1978 95 8.7 11807
Kibar Feyzo 1978 83 8.7 17128
Hababam Sinifi Uyaniyor 1976 94 8.7 20640
Canim Kardesim 1973 85 8.6 10097

Since no movie appears in both tables, it is clear that IMDb considers other factors in addition to ratings when identifying the top 1000 films.

It’s also unexpected that the oldest film in the top 1000 was released in 1996, while the best-rated movies in movies_df are mostly from the 1970s.

Back to top