Assignment 2

The second assignment involves the Advanced Search feature on https://m.imdb.com/search, allowing us to list available movie data on IMDb. The task is to scrape the HTML and extract the results. Assignment II consists of four parts.

Part I

Defining URLs:

Show the code

# Search URL for feature films with country_of_origin=TR and a release date
# between 2010-01-01 and 2023-12-31.
# NOTE(review): `num_votes=2500,` is presumably an open-ended "2500 or more"
# range and `count=250` the number of results per page -- confirm against IMDb.
url1 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&num_votes=2500,&country_of_origin=TR&count=250"
# Same query, but with the release date limited to 2009-12-31 or earlier.
url2 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&num_votes=2500,&country_of_origin=TR&count=250"

Part II

Loading necessary libraries:

Show the code

library(tidyverse) # for everything :)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Show the code

library(rvest) # for HTML scraping


Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding

Show the code

library(stringr) # for string processing
library(knitr)
library(ggplot2)

For Titles:

Show the code

# Collect the movie titles shown on one search-results page.
extract_titles <- function(page_url) {
  heading_text <- page_url %>%
    read_html() %>%
    html_elements(".ipc-title__text") %>%
    html_text()
  # Drop the first and the last match (presumably page headings rather than
  # movie entries -- verify against the page markup).
  movie_entries <- tail(head(heading_text, -1), -1)
  # Split each entry at its first space and keep what follows it
  # (presumably this removes a leading rank such as "1.").
  entry_parts <- str_split(movie_entries, " ", n = 2)
  unlist(lapply(entry_parts, `[`, 2))
}

title1 <- extract_titles(url1)
title2 <- extract_titles(url2)

# Titles from both result pages, url1 first.
Title <- c(title1, title2)

For Years:

Show the code

# Read the first metadata item of every listing on a results page as a number.
extract_years <- function(page_url) {
  year_text <- html_text(
    html_elements(read_html(page_url), ".dli-title-metadata-item:nth-child(1)")
  )
  as.numeric(year_text)
}

year1 <- extract_years(url1)
year2 <- extract_years(url2)

# Years from both result pages, url1 first.
Year <- c(year1, year2)

For Durations :

Show the code

#' Convert a runtime string such as "2h 12m" to total minutes.
#'
#' Accepts an hours part, a minutes part, or both ("2h 12m", "2h", "45m").
#' The function is vectorised: it works on a single string as well as on a
#' whole character vector, and on zero-length input.
#'
#' @param time_string Character vector of runtime strings.
#' @return Numeric vector of the same length as `time_string` holding the
#'   duration in minutes; `NA` where the value is missing or is not an
#'   hours/minutes runtime.
convert_time_updated <- function(time_string) {
  time_string <- trimws(as.character(time_string))

  # Optional "<digits>h" part followed by an optional "<digits>m" part.
  runtime_pattern <- "^(?:(\\d+)\\s*h)?\\s*(?:(\\d+)\\s*m)?$"
  # nzchar() excludes "", which the all-optional pattern would otherwise match.
  is_runtime <- !is.na(time_string) & nzchar(time_string) &
    grepl(runtime_pattern, time_string, perl = TRUE)

  total_duration <- rep(NA_real_, length(time_string))
  runtimes <- time_string[is_runtime]

  # A part that is absent yields "" from the back-reference, which
  # as.numeric() turns into NA; such a part counts as zero.
  total_hours <- as.numeric(sub(runtime_pattern, "\\1", runtimes, perl = TRUE))
  total_minutes <- as.numeric(sub(runtime_pattern, "\\2", runtimes, perl = TRUE))
  total_hours[is.na(total_hours)] <- 0
  total_minutes[is.na(total_minutes)] <- 0

  total_duration[is_runtime] <- total_hours * 60 + total_minutes
  total_duration
}

# Read the second metadata item of every listing on a results page and pass
# each string through convert_time_updated().
extract_durations <- function(page_url) {
  runtime_text <- page_url %>%
    read_html() %>%
    html_elements(".dli-title-metadata-item:nth-child(2)") %>%
    html_text()
  unlist(lapply(runtime_text, convert_time_updated))
}

duration1 <- extract_durations(url1)
duration2 <- extract_durations(url2)

# Durations from both result pages, url1 first.
Duration <- c(duration1, duration2)

For Ratings :

Show the code

# Read the text of every `.ratingGroup--imdb-rating` node on a results page
# and keep the first decimal number found in each as the rating.
extract_ratings <- function(page_url) {
  rating_text <- page_url %>%
    read_html() %>%
    html_elements(".ratingGroup--imdb-rating") %>%
    html_text()
  as.numeric(str_extract(rating_text, "\\d+\\.\\d+"))
}

rating1 <- extract_ratings(url1)
rating2 <- extract_ratings(url2)

# Ratings from both result pages, url1 first.
Rating <- c(rating1, rating2)

For Votes :

Show the code

# Read the text of every `.kRnqtn` node on a results page and keep only its
# digits as the vote count.
# NOTE(review): `.kRnqtn` looks like an auto-generated class name and may
# change when the site is redeployed -- confirm it still matches.
extract_votes <- function(page_url) {
  votes_text <- page_url %>%
    read_html() %>%
    html_elements(".kRnqtn") %>%
    html_text()
  as.numeric(gsub("\\D", "", votes_text))
}

votes1 <- extract_votes(url1)
votes2 <- extract_votes(url2)

# Vote counts from both result pages, url1 first.
Votes <- c(votes1, votes2)

Data Frame :

Show the code

# Assemble the scraped vectors into one data frame, one row per movie.
movies <- data.frame(
  Title = Title,
  Year = Year,
  Duration = Duration,
  Rating = Rating,
  Votes = Votes
)

# Show the first 15 rows.
print(head(movies, n = 15))

                       Title Year Duration Rating  Votes
1          Kuru Otlar Üstüne 2023      197    8.1   5090
2    Istanbul Için Son Çagri 2023       91    5.3   7398
3   Yedinci Kogustaki Mucize 2019      132    8.2  54172
4             Ölümlü Dünya 2 2023      117    7.5   3490
5                     Bihter 2023      113    3.6   3356
6               Ölümlü Dünya 2018      107    7.6  30270
7                 Kis Uykusu 2014      196    8.0  54647
8                     Dag II 2016      135    8.2 109870
9             Do Not Disturb 2023      114    6.3   8781
10 Ayla: The Daughter of War 2017      125    8.3  42992
11              Kurak Günler 2022      129    7.6  11167
12               Ahlat Agaci 2018      188    8.0  27015
13   Bir Zamanlar Anadolu'da 2011      157    7.8  49365
14       Dabbe: Cin Çarpmasi 2013      134    6.8   7109
15                    Baskin 2015       97    5.8  12107

Part III

A) Top 5 and Bottom 5 Movies by Rating

Top 5 Movies by Rating

Show the code

# Rank all movies from the highest to the lowest rating.
rating_rank_desc <- order(movies$Rating, decreasing = TRUE)
top_movies <- movies[rating_rank_desc, ]

# The five best-rated movies are the first five rows of that ranking.
top5 <- head(top_movies, n = 5)
print(top5)

                           Title Year Duration Rating Votes
257               Hababam Sinifi 1975       87    9.2 42512
39        CM101MMXI Fundamentals 2013      139    9.1 46997
273                   Tosun Pasa 1976       90    8.9 24329
337 Hababam Sinifi Sinifta Kaldi 1975       95    8.9 24369
321                Süt Kardesler 1976       80    8.8 20889

I’ve watched all of them multiple times, and I believe they deserve the ratings they received.

Bottom 5 Movies by Rating

Show the code

# Rank all movies from the highest to the lowest rating.
rating_rank_for_bottom <- order(movies$Rating, decreasing = TRUE)
bot_movies <- movies[rating_rank_for_bottom, ]

# The five lowest-rated movies are the last five rows of that ranking.
bottom5 <- tail(bot_movies, n = 5)
print(bottom5)

                             Title Year Duration Rating Votes
189                 Cumali Ceber 2 2018      100    1.2 10230
199                          Müjde 2022       48    1.2  9920
245              15/07 Safak Vakti 2021       95    1.2 20608
101 Cumali Ceber: Allah Seni Alsin 2017      100    1.0 39269
150                           Reis 2017      108    1.0 73974

I haven’t watched any of them, and I don’t intend to. Just the IMDb rating can convey a lot about a movie. I am definitely biased against these films :)

B) My Top 3 Favorite Turkish Movies

Show the code

# Keep only the rows whose title is one of the three favourites.
favorite_titles <- c("Masumiyet", "Babam ve Oglum", "Yahsi Bati")
favorite_movies <- filter(movies, Title %in% favorite_titles)
print(favorite_movies)

           Title Year Duration Rating Votes
1 Babam ve Oglum 2005      108    8.2 91037
2      Masumiyet 1997      110    8.1 19295
3     Yahsi Bati 2009      112    7.4 37784

I believe they deserve the ratings they received. The rating for Yahşi Batı could have been a bit higher. In my opinion, it’s Cem Yılmaz’s funniest movie, but some people think it’s not as good as GORA and AROG.

C) Plots

Average Ratings of Turkish Movies Over the Years

Show the code

# One point per release year: the mean rating of that year's movies.
movies %>%
  group_by(Year) %>%
  summarize(ave_rating = mean(Rating)) %>%
  ggplot(mapping = aes(x = Year, y = ave_rating)) +
  geom_point() +
  labs(
    title = "Average Ratings of Turkish Movies Over the Years",
    x = "Year",
    y = "Rating Averages"
  ) +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))

The plot shows that the average rating of Turkish movies has decreased over the years. I wonder about the reason behind this. I believe it might be due to the lack of meaningful stories and scripts.

Number of Movies Over the Years

Show the code

# Bar chart of how many movies fall in each release year; Year is treated as
# a discrete category so every year gets its own bar.
ggplot(data = movies, mapping = aes(x = factor(Year))) +
  geom_bar(fill = "purple", color = "#FBDA21") +
  labs(
    title = "Number of Movies Over the Years",
    x = "Year",
    y = "Number of Movies"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    # Rotate the year labels so they do not overlap.
    axis.text.x = element_text(angle = 90),
    # Centre the plot title.
    plot.title = element_text(hjust = 0.5)
  )

Average Ratings Over the Years

Show the code

# Box plot of the rating distribution for each release year.
# Year is mapped as a factor on the x axis (as in the bar chart of movie
# counts) so each year is an explicit group, and the fill legend is hidden:
# it would repeat every year already shown on the axis and squeeze the panel.
ggplot(movies, aes(x = factor(Year), y = Rating, fill = factor(Year))) +
  geom_boxplot() +
  labs(x = "Year", y = "Rating") +
  ggtitle("Average Ratings Over the Years") +
  theme_minimal() +
  theme(
    legend.position = "none",
    # Rotate the year labels so they do not overlap.
    axis.text.x = element_text(angle = 90),
    # Centre the plot title.
    plot.title = element_text(hjust = 0.5)
  )

D) Votes and Ratings

Correlation between Votes and Ratings:

Show the code

# Correlation coefficient between vote counts and ratings (cor() default
# method).
correlation <- with(movies, cor(Votes, Rating))
print(correlation)

[1] 0.1308758

Show the code

# Scatter plot of rating against vote count with a fitted straight line.
# No colour is mapped in aes(): both layers set a constant colour, so a
# mapped colour would never be drawn.
ggplot(movies, aes(x = Votes, y = Rating)) +
  geom_point(color = "purple") +
  # Linear fit without the confidence band.
  geom_smooth(method = "lm", se = FALSE, color = "#FBDA21", formula = y ~ x) +
  labs(x = "Votes", y = "Rating") +
  ggtitle("Scatter Plot of Votes vs Rating") +
  theme_minimal() +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))

The value of 0.1308758 represents a quite low correlation. In this case, we can say that there is only a weak linear relationship between Votes and Ratings.

E) Durations and Ratings

Correlation between Durations and Ratings:

Show the code

# Correlation coefficient between durations and ratings (cor() default
# method).
correlation1 <- with(movies, cor(Duration, Rating))
print(correlation1)

[1] 0.03356006

Show the code

# Scatter plot of rating against duration with a fitted straight line.
# No colour is mapped in aes(): both layers set a constant colour, so a
# mapped colour would never be drawn.
ggplot(movies, aes(x = Duration, y = Rating)) +
  geom_point(color = "purple") +
  # Linear fit without the confidence band.
  geom_smooth(method = "lm", se = FALSE, color = "#FBDA21", formula = y ~ x) +
  labs(x = "Duration", y = "Rating") +
  # The title names the variables actually plotted (Duration, not Votes).
  ggtitle("Scatter Plot of Duration vs Rating") +
  theme_minimal() +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))

The value of 0.03356006 represents a quite low correlation. In this case, we can say that there is only a weak linear relationship between Durations and Ratings. Comparing the two correlations, Ratings with Votes and Ratings with Durations, we can say that the connection between Ratings and Votes is the stronger one.

Part IV

Turkish Movies in the IMDb Top 1000

Show the code

# Search URL for feature films with country_of_origin=TR in the top_1000 group.
url3 <- "https://m.imdb.com/search/title/?title_type=feature&groups=top_1000&country_of_origin=TR"

# Download and parse the page once; titles and years are both read from this
# single document, so they come from the same response.
page3 <- read_html(url3)

title3 <- page3 %>%
  html_elements(".ipc-title__text") %>%
  html_text()
# Drop the first and the last match (presumably page headings rather than
# movie entries -- verify against the page markup).
title3 <- tail(head(title3, -1), -1)
# Split each entry at its first space and keep what follows it.
title3 <- str_split(title3, " ", n = 2)
title3 <- unlist(lapply(title3, function(x) {
  x[2]
}))

year3 <- page3 %>%
  html_elements(".dli-title-metadata-item:nth-child(1)") %>%
  html_text()
# Year is stored as a factor here, as before.
year3 <- as.factor(year3)

top1000 <- data.frame(Title = title3, Year = year3)
print(top1000)

                       Title Year
1   Yedinci Kogustaki Mucize 2019
2                 Kis Uykusu 2014
3      Nefes: Vatan Sagolsun 2009
4  Ayla: The Daughter of War 2017
5             Babam ve Oglum 2005
6                Ahlat Agaci 2018
7    Bir Zamanlar Anadolu'da 2011
8                     Eskiya 1996
9                   G.O.R.A. 2004
10                 Vizontele 2001
11  Her Sey Çok Güzel Olacak 1998

Combining Data Frames

Show the code

# Convert Year to numeric via character so the year labels themselves, not
# factor level codes, become the numbers.
year_labels <- as.character(top1000$Year)
top1000$Year <- as.numeric(year_labels)

# Attach the remaining columns of `movies` to each Top 1000 row, matching on
# both Title and Year; rows of top1000 without a match are kept.
joineddata <- top1000 %>%
  left_join(movies, by = c("Title", "Year"))
print(joineddata)

                       Title Year Duration Rating Votes
1   Yedinci Kogustaki Mucize 2019      132    8.2 54172
2                 Kis Uykusu 2014      196    8.0 54647
3      Nefes: Vatan Sagolsun 2009      128    8.0 35022
4  Ayla: The Daughter of War 2017      125    8.3 42992
5             Babam ve Oglum 2005      108    8.2 91037
6                Ahlat Agaci 2018      188    8.0 27015
7    Bir Zamanlar Anadolu'da 2011      157    7.8 49365
8                     Eskiya 1996      128    8.1 71704
9                   G.O.R.A. 2004      127    8.0 66032
10                 Vizontele 2001      110    8.0 38403
11  Her Sey Çok Güzel Olacak 1998      107    8.1 27122

Top 11 Turkish Movies Based on Rating

Show the code

# Rank all movies from the highest to the lowest rating.
rating_rank_top11 <- order(movies$Rating, decreasing = TRUE)
top_movies <- movies[rating_rank_top11, ]

# Keep the eleven best-rated movies.
top11 <- head(top_movies, n = 11)
print(top11)

                           Title Year Duration Rating Votes
257               Hababam Sinifi 1975       87    9.2 42512
39        CM101MMXI Fundamentals 2013      139    9.1 46997
273                   Tosun Pasa 1976       90    8.9 24329
337 Hababam Sinifi Sinifta Kaldi 1975       95    8.9 24369
321                Süt Kardesler 1976       80    8.8 20889
284             Saban Oglu Saban 1977       90    8.7 18534
307                   Zügürt Aga 1985      101    8.7 16136
317                Neseli Günler 1978       95    8.7 11807
323                  Kibar Feyzo 1978       83    8.7 17126
380      Hababam Sinifi Uyaniyor 1976       94    8.7 20640
343               Canim Kardesim 1973       85    8.6 10097

None of the top 11 movies by rating in our data frame matches a film in the IMDb Top 1000 list. I believe the differences I observed when comparing the two tables are in the film years and votes. It seems that the Top 1000 includes more recent films with significantly higher votes. (Note: I don’t know why CM101MMXI Fundamentals is in the second list 😊)