Assignment 2

Analysis of Turkish Movies (IMDb)

Movies are filtered and the search URLs are saved

Code

# Build an IMDb search URL for Turkish feature films with at least 2500 votes
# whose release date lies between `from` and `to` ("YYYY-MM-DD" strings).
# Leaving `from` or `to` empty leaves that side of the date range open.
# Keeping the query string in one place avoids the two URLs drifting apart.
build_search_url <- function(from = "", to = "") {
  paste0(
    "https://m.imdb.com/search/title/?title_type=feature",
    "&release_date=", from, ",", to,
    "&num_votes=2500,&country_of_origin=TR&count=250"
  )
}

# searches between 01/01/2010-31/12/2023
url_1 <- build_search_url(from = "2010-01-01", to = "2023-12-31")

# searches before 31/12/2009
url_2 <- build_search_url(to = "2009-12-31")

combined_url <- c(url_1, url_2)

Necessary packages are imported

Code

knitr::opts_chunk$set(warning = FALSE)
library(tidyverse)
library(rvest)
library(stringr)
library(ggplot2)
library(ggthemes)

Web Scraping

Turkish movies that have at least 2,500 votes are filtered.

Code

# Convert runtime labels such as "2h 5m", "2h" or "45m" to minutes.
# Vectorised over `duration`. Returns NA when a label contains neither an
# hour nor a minute part, so a missing runtime is not stored as 0 minutes.
convert_to_minutes <- function(duration) {
  hours <- as.numeric(str_extract(duration, "\\d+(?=h)"))
  minutes <- as.numeric(str_extract(duration, "\\d+(?=m)"))
  total_minutes <- ifelse(is.na(hours), 0, hours) * 60 +
    ifelse(is.na(minutes), 0, minutes)
  total_minutes[is.na(hours) & is.na(minutes)] <- NA_real_
  total_minutes
}

# Pull the first run of digits/commas out of each string (e.g. "5,058")
# and return it as a number with the thousands separators removed.
extract_numeric <- function(string) {
  numeric_part <- str_extract(string, "\\d[0-9,]+")
  as.numeric(gsub(",", "", numeric_part))
}

# Scrape one search-result page and return a data frame with one row per
# movie and the columns title, release_year, duration, rating and vote.
# NOTE(review): the "sc-..." class names look auto-generated and may change
# when the site is redeployed; confirm the selectors still match.
scrape_search_page <- function(url) {
  page <- read_html(url)

  # The first and last heading nodes are dropped; the remaining ones look
  # like "<rank>. <title>", so everything after the first space is the title.
  headings <- page |>
    html_nodes(".ipc-title__text") |>
    html_text()
  headings <- tail(head(headings, -1), -1)
  titles <- vapply(
    str_split(headings, " ", n = 2),
    function(parts) parts[2],
    character(1)
  )

  # The metadata text starts with the 4-digit year, immediately followed by
  # the runtime; the same nodes feed both columns, so they are read once.
  metadata <- page |>
    html_nodes(".sc-43986a27-7.dBkaPT.dli-title-metadata") |>
    html_text()
  years <- as.numeric(strtrim(metadata, 4))
  runtime_labels <- metadata |>
    str_extract("\\d+h( \\d+m)?|\\d+m|\\d+") |>
    str_extract("(?<=^.{4}).*") # strip the leading year digits
  runtimes <- convert_to_minutes(runtime_labels)

  # Only the first three characters of the rating text are used (e.g. "8.1").
  rating_text <- page |>
    html_nodes(".sc-43986a27-1.fVmjht") |>
    html_text()
  ratings <- as.numeric(str_sub(rating_text, 1, 3))

  vote_text <- page |>
    html_nodes(".sc-53c98e73-0.kRnqtn") |>
    html_text()
  votes <- extract_numeric(vote_text)

  # Fail loudly if a field is missing for some movie: otherwise the columns
  # would be combined out of alignment.
  field_lengths <- c(
    title = length(titles),
    metadata = length(metadata),
    rating = length(ratings),
    vote = length(votes)
  )
  if (length(unique(field_lengths)) != 1L) {
    stop(
      "Scraped fields have different lengths for ", url, ": ",
      paste0(names(field_lengths), "=", field_lengths, collapse = ", "),
      call. = FALSE
    )
  }

  data.frame(
    title = titles,
    release_year = years,
    duration = runtimes,
    rating = ratings,
    vote = votes
  )
}

# One data frame per search URL, stacked into a single table.
movies <- bind_rows(lapply(combined_url, scrape_search_page))

head(movies)

                     title release_year duration rating  vote
1        Kuru Otlar Üstüne         2023      197    8.1  5058
2  Istanbul Için Son Çagri         2023       91    5.3  7329
3 Yedinci Kogustaki Mucize         2019      132    8.2 54151
4           Ölümlü Dünya 2         2023      117    7.5  3440
5                   Bihter         2023      113    3.6  3340
6             Ölümlü Dünya         2018      107    7.6 30258

Above, you can see the dataframe generated by scraping the data from the web.

a) Arranged by Rating

pre-processing

Code

# Sort the movies by rating (highest first) and store each row's position
# after sorting as `ranking`, placed as the first column.
# row_number() is used instead of 1:length(title), which would produce
# c(1, 0) and fail on an empty table.
movies <- movies %>%
  arrange(desc(rating)) %>%
  mutate(ranking = row_number()) %>%
  select(ranking, everything())

Head

Top 5 movies based on user ratings are shown below.

Code

# First five rows of `movies`, showing only the columns of interest.
movies %>%
  select(title, rating, vote, release_year) %>%
  head(n = 5L)

                         title rating  vote release_year
1               Hababam Sinifi    9.2 42511         1975
2       CM101MMXI Fundamentals    9.1 46995         2013
3                   Tosun Pasa    8.9 24327         1976
4 Hababam Sinifi Sinifta Kaldi    8.9 24369         1975
5                Süt Kardesler    8.8 20884         1976

I doubt there is anyone who hasn’t watched at least one of these movies. I guess everybody would agree on these five. They feature many great actors and actresses who still make us laugh today. The second one is not actually a movie, though.

Tail

The bottom 5 is shown below.

Code

# Last five rows of `movies`, showing only the columns of interest.
movies %>%
  select(title, rating, vote) %>%
  tail(n = 5L)

                             title rating  vote
466                 Cumali Ceber 2    1.2 10228
467                          Müjde    1.2  9920
468              15/07 Safak Vakti    1.2 20606
469 Cumali Ceber: Allah Seni Alsin    1.0 39266
470                           Reis    1.0 73972

Judging a movie before watching it is not appropriate. So, I have no comments about these movies.

b) My Favorite Ones

My favorite 3 movies, their rankings and ratings are listed below.

Code

# Keep only the rows whose title is one of the three favourites.
movies %>%
  filter(title %in% c("Dag II", "A.R.O.G", "Kurtlar Vadisi: Gladio"))

  ranking                  title release_year duration rating   vote
1      24                 Dag II         2016      135    8.2 109865
2     136                A.R.O.G         2008      127    7.3  44631
3     311 Kurtlar Vadisi: Gladio         2009       97    6.2   5288

c) Visualization

Yearly rating averages are visualized below. You can see that the rating averages decrease as we get closer to the present. However, one needs to consider the number of movies released each year, since it is directly related to the rating averages.

Code

# Mean rating per release year, drawn as one point per year.
movies |>
  group_by(release_year) |>
  summarise(yearly_average = mean(rating)) |>
  ggplot(mapping = aes(x = release_year, y = yearly_average)) +
  geom_point() +
  labs(title = "Yearly Rating Averages") +
  theme_pander()

Below you can see that the number of movies released has generally increased over the years.

Code

# Number of movies per release year, drawn as one point per year.
movies %>%
  count(release_year, name = "movie_number") %>%
  ggplot(mapping = aes(x = release_year, y = movie_number)) +
  geom_point() +
  labs(title = "Number of Movies Over the Year") +
  theme_pander()

Box Plot

Code

# Distribution of ratings per release year; the year is treated as a
# category so that each year gets its own box, and the x labels are
# rotated so they do not overlap.
movies |>
  ggplot(mapping = aes(x = as.factor(release_year), y = rating)) +
  geom_boxplot() +
  theme_pander() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "release_year")

After 2003, the number of movies increased dramatically, which resulted in relatively lower ratings.

Vote vs Rating

Code

# Scatter plot of rating against number of votes, one point per movie.
ggplot(movies, mapping = aes(x = vote, y = rating)) +
  geom_point() +
  theme_pander() +
  labs(title = "Vote vs Rating")

Most of the vote counts are within the range of 0 to 15,000. In this range, ratings accumulate above 5.0. As the number of votes increases, the rating is generally high. However, further investigation with a larger dataset is required.

Duration vs Rating

Code

# Scatter plot of rating against duration, one point per movie.
ggplot(movies, mapping = aes(x = duration, y = rating)) +
  geom_point() +
  theme_pander() +
  labs(title = "Duration vs Rating")

The durations of the movies are concentrated roughly between 75 and 130 minutes. However, in this range we can see both high and low ratings. Hence, there is no clear relationship between duration and rating.

Turkish Movies in IMDb Top 1000

Web Scraping

Code

knitr::opts_chunk$set(warning = FALSE)

# Search restricted to Turkish feature films in the "top_1000" group.
url <- "https://m.imdb.com/search/title/?title_type=feature&groups=top_1000&country_of_origin=TR"

Html_ <- read_html(url)

# Heading nodes: the first and last ones are dropped, and everything after
# the first space of each remaining heading is kept as the title.
title_top <- Html_ |>
  html_nodes(".ipc-title__text") |>
  html_text()
title_top <- tail(head(title_top, -1), -1)
title_top <- vapply(
  str_split(title_top, " ", n = 2),
  function(parts) parts[2],
  character(1)
)

# The first four characters of each metadata string are read as the year.
# NOTE(review): the "sc-..." class names look auto-generated and may change
# when the site is redeployed; confirm the selector still matches.
release_years <- Html_ |>
  html_nodes(".sc-43986a27-7.dBkaPT.dli-title-metadata") |>
  html_text()
release_years <- strtrim(release_years, 4)
release_year_top <- as.numeric(release_years)

# tibble() replaces the deprecated data_frame() constructor.
movies_top <- tibble(title_top, release_year_top)

Joining with the original table

Code

# Look up every top-1000 movie in the full `movies` table.
# The match uses title AND release year: joining on the title alone could
# pair a movie with a different film of the same name, or duplicate rows.
# The year key is renamed back to `release_year` so the output columns are
# ranking, title_top, release_year, duration, rating, vote.
movies_top %>%
  left_join(
    movies,
    by = c("title_top" = "title", "release_year_top" = "release_year")
  ) %>%
  rename(release_year = release_year_top) %>%
  select(ranking, everything()) %>%
  arrange(desc(rating))

# A tibble: 11 × 6
   ranking title_top                 release_year duration rating  vote
     <int> <chr>                            <dbl>    <dbl>  <dbl> <dbl>
 1      20 Ayla: The Daughter of War         2017      125    8.3 42989
 2      23 Yedinci Kogustaki Mucize          2019      132    8.2 54151
 3      27 Babam ve Oglum                    2005      108    8.2 91021
 4      31 Eskiya                            1996      128    8.1 71698
 5      32 Her Sey Çok Güzel Olacak          1998      107    8.1 27119
 6      37 Kis Uykusu                        2014      196    8   54631
 7      40 Nefes: Vatan Sagolsun             2009      128    8   35015
 8      38 Ahlat Agaci                       2018      188    8   26995
 9      42 G.O.R.A.                          2004      127    8   66027
10      44 Vizontele                         2001      110    8   38398
11      58 Bir Zamanlar Anadolu'da           2011      157    7.8 49348

The ranking shows the actual position of these movies in the movies dataset. Although they are in IMDb's top 1000 list, they are not at the top of the first dataframe. Hence, we can say that IMDb uses some other parameters to order the movies in its top 1000 list. These parameters could be awards won, tickets sold, etc.