Assignment 2

1) Filtering Movies

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.3.2

Warning: package 'ggplot2' was built under R version 4.3.2

Warning: package 'tidyr' was built under R version 4.3.2

Warning: package 'readr' was built under R version 4.3.2

Warning: package 'purrr' was built under R version 4.3.2

Warning: package 'dplyr' was built under R version 4.3.2

Warning: package 'forcats' was built under R version 4.3.2

Warning: package 'lubridate' was built under R version 4.3.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringr)
library(rvest)

Warning: package 'rvest' was built under R version 4.3.2


Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding

library(ggplot2)
library(knitr)

# IMDb search URLs for the two scraping passes. Both query strings ask for
# title_type=feature, num_votes=2500, (presumably "at least 2500 votes"),
# country_of_origin=TR and count=250. They differ only in release_date:
# url_vector_1 uses 2010-01-01,2023-12-31 and url_vector_2 uses ,2009-12-31
# (presumably "anything up to the end of 2009").
url_vector_1 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&num_votes=2500,&country_of_origin=TR&count=250"
url_vector_2 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&num_votes=2500,&country_of_origin=TR&count=250"

2) Creating Data Frame

# Fetch and parse both search-result pages.
html_1 <- url_vector_1 |> read_html()
html_2 <- url_vector_2 |> read_html()

# title
# Take the text of every ".ipc-title__text" node, drop the first and the last
# match, split each remaining string at its first space and keep the part
# after it (presumably this strips a leading rank such as "1.").
title_names_1 <- html_1 |>
  html_nodes(".ipc-title__text") |>
  html_text() |>
  head(-1) |>
  tail(-1) |>
  str_split(" ", n = 2)
title_names_1 <- unlist(lapply(title_names_1, `[`, 2))

title_names_2 <- html_2 |>
  html_nodes(".ipc-title__text") |>
  html_text() |>
  head(-1) |>
  tail(-1) |>
  str_split(" ", n = 2)
title_names_2 <- unlist(lapply(title_names_2, `[`, 2))

title <- c(title_names_1, title_names_2)

# year
# The first metadata item of each result is read as text and coerced to a
# number.
year_1 <- html_1 |>
  html_nodes(".sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)") |>
  html_text() |>
  as.numeric()

year_2 <- html_2 |>
  html_nodes(".sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)") |>
  html_text() |>
  as.numeric()

year <- c(year_1, year_2)

# vote
# Read the vote-count text and strip every non-digit character before
# converting to a number.
# NOTE(review): the two pages use different selectors (".sc-53c98e73-0.kRnqtn"
# vs ".kRnqtn") -- confirm both select the same nodes.
vote_1 <- html_1 |>
  html_nodes(".sc-53c98e73-0.kRnqtn") |>
  html_text()
vote_1 <- as.numeric(gsub("\\D", "", vote_1))

vote_2 <- html_2 |>
  html_nodes(".kRnqtn") |>
  html_text()
vote_2 <- as.numeric(gsub("\\D", "", vote_2))

vote <- c(vote_1, vote_2)

# ratings
# Only the first three characters of the rating text are kept and converted
# to a number.
rating_1 <- html_1 |>
  html_nodes(".ipc-rating-star.ipc-rating-star--base.ipc-rating-star--imdb.sc-9ab53865-1.iXEijC.ratingGroup--imdb-rating") |>
  html_text() |>
  str_sub(start = 1, end = 3) |>
  as.numeric()

rating_2 <- html_2 |>
  html_nodes(".ipc-rating-star.ipc-rating-star--base.ipc-rating-star--imdb.sc-9ab53865-1.iXEijC.ratingGroup--imdb-rating") |>
  html_text() |>
  str_sub(start = 1, end = 3) |>
  as.numeric()

rating <- c(rating_1, rating_2)

# duration (I got help from ChatGPT)
# The second metadata item of each result is read as raw text; it is
# converted to minutes further below.
duration_1 <- html_1 |>
  html_nodes(".sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(2)") |>
  html_text()

duration_2 <- html_2 |>
  html_nodes(".sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(2)") |>
  html_text()

#' Convert runtime strings such as "1h 27m" to total minutes
#'
#' The hour part and the minute part are parsed independently, so a runtime
#' that contains only one of them ("2h", "45m") is still converted; the
#' missing part counts as 0. Strings that contain neither part, and `NA`
#' inputs, yield `NA`.
#'
#' @param time_strings Character vector of runtime strings.
#' @return Numeric vector of total minutes, the same length as `time_strings`.
convert_time_to_minutes <- function(time_strings) {
  time_strings <- as.character(time_strings)

  # Number that directly precedes the given unit letter ("h" or "m");
  # NA for elements in which that unit does not occur.
  extract_part <- function(unit) {
    pattern <- paste0("[0-9]+(?=\\s*", unit, ")")
    hits <- regexpr(pattern, time_strings, perl = TRUE)
    found <- !is.na(hits) & hits > -1L
    values <- rep(NA_real_, length(time_strings))
    values[found] <- as.numeric(regmatches(time_strings, hits))
    values
  }

  hours <- extract_part("h")
  minutes <- extract_part("m")

  # Only elements with neither an hour nor a minute part stay NA.
  unparsed <- is.na(hours) & is.na(minutes)
  hours[is.na(hours)] <- 0
  minutes[is.na(minutes)] <- 0

  total_minutes <- hours * 60 + minutes
  total_minutes[unparsed] <- NA_real_
  total_minutes
}
# Convert the scraped runtime strings of both pages to minutes.
total_minutes_result_1 <- duration_1 |> convert_time_to_minutes()
total_minutes_result_2 <- duration_2 |> convert_time_to_minutes()

duration <- c(total_minutes_result_1, total_minutes_result_2)

# Combine the five scraped vectors into one data frame.
movies <- data.frame(
  title = title,
  year = year,
  duration = duration,
  rating = rating,
  vote = vote
)

3) Exploratory Data Analysis

a) Top and Bottom 5 Movies by Ranking

# `movies` was already built from the scraped vectors in the previous section,
# so it is not rebuilt here. Sort by rating, highest first.
movies <- arrange(movies, desc(rating))

# After the descending sort the first five rows are the best-rated movies and
# the last five rows are the worst-rated ones.
top_5_movies <- head(movies, 5)

last_5_movies <- tail(movies, 5)

kable(rbind(top_5_movies, last_5_movies), caption = "Best and Worst 5 Movies")

Best and Worst 5 Movies
	title	year	duration	rating	vote
1	Hababam Sinifi	1975	87	9.2	42515
2	CM101MMXI Fundamentals	2013	139	9.1	46998
3	Tosun Pasa	1976	90	8.9	24330
4	Hababam Sinifi Sinifta Kaldi	1975	95	8.9	24370
5	Süt Kardesler	1976	80	8.8	20890
466	Cumali Ceber 2	2018	100	1.2	10230
467	Müjde	2022	NA	1.2	9920
468	15/07 Safak Vakti	2021	95	1.2	20608
469	Cumali Ceber: Allah Seni Alsin	2017	100	1.0	39269
470	Reis	2017	108	1.0	73975

I have watched the first 5 movies on the list. Four of them are Yeşilçam films that have kept their importance over the years and are known to everyone, so they deserve the ratings they received. However, films that have earned a place in world cinema and can compete with these Yeşilçam films should also rank alongside them. In my opinion, there are other films that deserve to be at least as high on this list as the Yeşilçam films.

I can’t comment because I haven’t watched the last 5 movies.

b) My Favorite Movies

# Titles of my three favourite movies, spelled as they appear in `movies`.
fav_movies <- c("Ise Yarar Bir Sey", "Ölümlü Dünya 2", "Kurak Günler")

# Keep only the rows whose title is one of the favourites.
fav_movies_data <- subset(movies, title %in% fav_movies)

kable(fav_movies_data, caption = "My Favorite Movies")

My Favorite Movies
	title	year	duration	rating	vote
80	Kurak Günler	2022	129	7.6	11179
85	Ise Yarar Bir Sey	2017	104	7.6	5512
94	Ölümlü Dünya 2	2023	117	7.5	3532

c) Visualization

# Mean rating per release year, drawn as one point per year.
by_year <- movies |>
  group_by(year) |>
  summarise(avg_rating = mean(rating))

by_year |>
  ggplot(aes(x = year, y = avg_rating)) +
  geom_point()

As can be seen from the plot, average ratings tend to decrease over the years.

# One box plot of the ratings per release year.
movies |>
  ggplot(aes(x = year, y = rating, group = year)) +
  geom_boxplot()

d) Do you believe there is a relationship between the number of votes a movie received and its rating? Investigate the correlation between Votes and Ratings.

# Correlation between the number of votes and the rating.
with(movies, cor(vote, rating))

[1] 0.1309764

e) Do you believe there is a relationship between a movie's duration and its rating? Investigate the correlation between Duration and Ratings.

# Correlation between duration and rating. At least one movie has no duration
# (NA), which makes cor() return NA by default; use = "complete.obs" restricts
# the computation to movies where both values are present.
cor(movies$duration, movies$rating, use = "complete.obs")

[1] NA

4) Turkish Movies in the Top 1000 on IMDb

# Search URL whose query string asks for title_type=feature, groups=top_1000,
# country_of_origin=TR and count=250.
url_vector_3 <- "https://m.imdb.com/search/title/?title_type=feature&groups=top_1000&country_of_origin=TR&count=250"


html_3 <- url_vector_3 |> read_html()

# title
# Drop the first and the last ".ipc-title__text" match, then keep the part of
# each string after its first space.
title_names_3 <- html_3 |>
  html_nodes(".ipc-title__text") |>
  html_text() |>
  head(-1) |>
  tail(-1) |>
  str_split(" ", n = 2)
title_names_3 <- unlist(lapply(title_names_3, `[`, 2))

title <- title_names_3

# year

year_3 <- html_3 |>
  html_nodes(".sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)") |>
  html_text() |>
  as.numeric()

year <- year_3

top1000_df <- data.frame(title, year)

kable(top1000_df, caption = "Turkish Movies in IMDB Top 1000")

Turkish Movies in IMDB Top 1000
title	year
Yedinci Kogustaki Mucize	2019
Kis Uykusu	2014
Nefes: Vatan Sagolsun	2009
Ayla: The Daughter of War	2017
Babam ve Oglum	2005
Ahlat Agaci	2018
Bir Zamanlar Anadolu’da	2011
Eskiya	1996
G.O.R.A.	2004
Vizontele	2001
Her Sey Çok Güzel Olacak	1998

# Keep only the rows of `movies` that also appear in the Top 1000 list,
# matching on both title and year; this adds duration, rating and vote.
final_df <- movies |>
  inner_join(top1000_df, by = join_by(title, year))

kable(final_df, caption = "Turkish Movies in IMDB Top 1000")

Turkish Movies in IMDB Top 1000
title	year	duration	rating	vote
Ayla: The Daughter of War	2017	125	8.3	42997
Yedinci Kogustaki Mucize	2019	132	8.2	54182
Babam ve Oglum	2005	108	8.2	91046
Eskiya	1996	128	8.1	71704
Her Sey Çok Güzel Olacak	1998	107	8.1	27124
Kis Uykusu	2014	196	8.0	54654
Ahlat Agaci	2018	188	8.0	27022
Nefes: Vatan Sagolsun	2009	128	8.0	35026
G.O.R.A.	2004	127	8.0	66037
Vizontele	2001	110	8.0	38407
Bir Zamanlar Anadolu’da	2011	157	7.8	49374

# Order the joined table by rating, highest first, and print it.
final_df <- final_df |>
  arrange(desc(rating))
kable(final_df, caption = "Turkish Movies in IMDB Top 1000 by Ranked")

Turkish Movies in IMDB Top 1000 by Ranked
title	year	duration	rating	vote
Ayla: The Daughter of War	2017	125	8.3	42997
Yedinci Kogustaki Mucize	2019	132	8.2	54182
Babam ve Oglum	2005	108	8.2	91046
Eskiya	1996	128	8.1	71704
Her Sey Çok Güzel Olacak	1998	107	8.1	27124
Kis Uykusu	2014	196	8.0	54654
Ahlat Agaci	2018	188	8.0	27022
Nefes: Vatan Sagolsun	2009	128	8.0	35026
G.O.R.A.	2004	127	8.0	66037
Vizontele	2001	110	8.0	38407
Bir Zamanlar Anadolu’da	2011	157	7.8	49374

We see that this is not the same as the first data frame. When I examined the first data frame, only one of the first 11 movies was released after 2000. In this data frame, only 2 movies were released before 2000. In other words, IMDb may weigh the release year more heavily than ratings in its rankings.