Assignment 2

Libraries:

Show the code
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Show the code
library(rvest)

Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding
Show the code
library(knitr)

URLs and Function for scraping and the Dataframe:

My data frame from IMDb (showing only the first 15 rows).

Show the code
#' Convert a runtime string to a total number of minutes.
#'
#' Two layouts are understood:
#'   * unit-suffixed text such as "2h 15m", "3h" or "45m";
#'   * the colon-separated "H:MM" form (hours, then minutes).
#' Only the colon form was recognised previously, so any scraped runtime in
#' the unit-suffixed form ended up as NA.
#'
#' @param time_string A single character string describing a duration. If a
#'   longer vector is supplied only its first element is used.
#' @return Total minutes as a number, or `NA_real_` when the input is missing,
#'   empty, or matches neither layout.
convert_time_updated <- function(time_string) {
  if (length(time_string) == 0 || is.na(time_string[1])) {
    return(NA_real_)
  }

  text <- trimws(as.character(time_string[1]))
  if (!nzchar(text)) {
    return(NA_real_)
  }

  # Colon layout: exactly two numeric fields, hours then minutes.
  time_components <- strsplit(text, ":", fixed = TRUE)[[1]]
  if (length(time_components) == 2) {
    # Non-numeric fields become NA here; the warning is noise because the NA
    # result already reports the failure to the caller.
    fields <- suppressWarnings(as.numeric(time_components))
    if (anyNA(fields)) {
      return(NA_real_)
    }
    return(fields[1] * 60 + fields[2])
  }

  # Unit layout: optional "<n>h" followed by optional "<n>m" (or "<n>min").
  # Validate the whole string first so stray text is rejected, not half-parsed.
  unit_layout <- "^([0-9]+\\s*h)?\\s*([0-9]+\\s*m(in)?)?$"
  if (!grepl(unit_layout, text, perl = TRUE)) {
    return(NA_real_)
  }

  # Pull the digits that directly precede a unit letter; a missing unit is 0.
  unit_value <- function(unit) {
    hit <- regmatches(
      text,
      regexpr(paste0("[0-9]+(?=\\s*", unit, ")"), text, perl = TRUE)
    )
    if (length(hit) == 0) 0 else as.numeric(hit)
  }

  unit_value("h") * 60 + unit_value("m")
}

#' Scrape one search-results page into a data frame of movies.
#'
#' Each column is collected independently with its own CSS selector, so the
#' selectors must all yield the same number of matches for `data.frame()` to
#' line the values up row by row.
#'
#' @param url Address of the results page to download.
#' @return A data frame with columns Title, Year, Duration, Rating and Votes.
scrape_movie_info <- function(url) {
  doc <- read_html(url)

  # Text content of every node matching a CSS selector on this page.
  text_of <- function(css) {
    html_text(html_elements(doc, css))
  }

  # The first and last ".ipc-title__text" matches are discarded (presumably
  # page headings rather than movies -- confirm against the live markup).
  # Whatever precedes the first space of each remaining entry is then cut
  # off, keeping only the text after it.
  heading_text <- text_of(".ipc-title__text")
  listing_text <- head(tail(heading_text, -1), -1)
  title <- map_chr(str_split(listing_text, " ", n = 2), 2)

  year <- as.numeric(text_of(".dli-title-metadata-item:nth-child(1)"))

  duration <- map_dbl(
    text_of(".dli-title-metadata-item:nth-child(2)"),
    convert_time_updated
  )

  # First decimal number found in each rating element.
  rating_text <- text_of(".ratingGroup--imdb-rating")
  rating <- as.numeric(str_extract(rating_text, "\\d+\\.\\d+"))

  # Strip every non-digit character before converting the vote count.
  votes_text <- text_of(".kRnqtn")
  votes <- as.numeric(gsub("\\D", "", votes_text))

  data.frame(
    Title = title,
    Year = year,
    Duration = duration,
    Rating = rating,
    Votes = votes
  )
}


# Two search-result pages for features with country_of_origin=TR and at
# least 2500 votes: one for releases from 2010-01-01 to 2023-12-31, one for
# releases up to 2009-12-31. Each request asks for count=250 results.
url1 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2023-12-31&num_votes=2500,&country_of_origin=TR&count=250"
url2 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=,2009-12-31&num_votes=2500,&country_of_origin=TR&count=250"

# Scrape both pages and stack the results into a single data frame.
movies1 <- scrape_movie_info(url1)
movies2 <- scrape_movie_info(url2)
movies <- rbind(movies1, movies2)

# Preview the first 15 rows as a formatted table.
movies %>%
  head(15) %>%
  kable(caption = "IMDB Turkish Movies")
IMDB Turkish Movies
Title Year Duration Rating Votes
Kuru Otlar Üstüne 2023 NA 8.1 5087
Istanbul Için Son Çagri 2023 NA 5.3 7390
Yedinci Kogustaki Mucize 2019 NA 8.2 54168
Ölümlü Dünya 2 2023 NA 7.5 3481
Bihter 2023 NA 3.6 3356
Ölümlü Dünya 2018 NA 7.6 30269
Kis Uykusu 2014 NA 8.0 54646
Dag II 2016 NA 8.2 109868
Do Not Disturb 2023 NA 6.3 8778
Ayla: The Daughter of War 2017 NA 8.3 42991
Kurak Günler 2022 NA 7.6 11165
Ahlat Agaci 2018 NA 8.0 27015
Bir Zamanlar Anadolu’da 2011 NA 7.8 49363
Dabbe: Cin Çarpmasi 2013 NA 6.8 7109
Baskin 2015 NA 5.8 12107

Top and Bottom 5:

Show the code
# Order the whole table from the highest to the lowest rating, so the first
# five rows are the best-rated movies and the last five the worst-rated.
movies <- arrange(movies, desc(Rating))

top_5_movies <- movies %>% head(5)
bottom_5_movies <- movies %>% tail(5)

# Render the five best-rated movies.
kable(top_5_movies, caption = "Top 5 Movies based on User Ratings")
Top 5 Movies based on User Ratings
Title Year Duration Rating Votes
Hababam Sinifi 1975 NA 9.2 42512
CM101MMXI Fundamentals 2013 NA 9.1 46996
Tosun Pasa 1976 NA 8.9 24329
Hababam Sinifi Sinifta Kaldi 1975 NA 8.9 24369
Süt Kardesler 1976 NA 8.8 20889
Show the code
# Render the five worst-rated movies.
kable(bottom_5_movies, caption = "Bottom 5 Movies based on User Ratings")
Bottom 5 Movies based on User Ratings
Title Year Duration Rating Votes
466 Cumali Ceber 2 2018 NA 1.2 10230
467 Müjde 2022 NA 1.2 9919
468 15/07 Safak Vakti 2021 NA 1.2 20608
469 Cumali Ceber: Allah Seni Alsin 2017 NA 1.0 39269
470 Reis 2017 NA 1.0 73974

Visualizations:

Turkish movies are dying over the years. Somebody help.

Show the code
# Treat the release year as a category so every year gets its own position
# on the x axis and its own group in the summaries below.
movies[["Year"]] <- as.factor(movies[["Year"]])

# One row per year: the mean rating and how many movies that year has.
rating_averages <- summarise(
  group_by(movies, Year),
  Avg_Rating = mean(Rating),
  Num_Movies = n()
)

# Bubble chart: the point size encodes the number of movies in each year.
ggplot(rating_averages, aes(x = Year, y = Avg_Rating, size = Num_Movies)) +
  theme_minimal() +
  labs(
    title = "Average Ratings of Turkish Movies Over the Years",
    x = "Year",
    y = "Average Rating"
  ) +
  geom_point()

Show the code
# Number of movies per year, taken from the yearly summary table.
ggplot(rating_averages, aes(x = Year, y = Num_Movies)) +
  theme_minimal() +
  labs(
    title = "Number of Turkish Movies Over the Years",
    x = "Year",
    y = "Number of Movies"
  ) +
  geom_point()

Show the code
# Distribution of individual movie ratings within each year.
ggplot(movies, aes(x = Year, y = Rating)) +
  theme_minimal() +
  labs(
    title = "Box Plots of Ratings for Turkish Movies Over the Years",
    x = "Year",
    y = "Rating"
  ) +
  geom_boxplot()

IMDB Top 1000 Visualizations:

Show the code
# Search-result page restricted to groups=top_1000 for features with
# country_of_origin=TR and at least 2500 votes.
url <- "https://m.imdb.com/search/title/?title_type=feature&num_votes=2500,&groups=top_1000&country_of_origin=TR&count=250"

# NOTE: this overwrites the `movies` table built earlier in the document.
movies <- scrape_movie_info(url)
movies[["Year"]] <- as.factor(movies[["Year"]])

# One row per year: the mean rating and how many movies that year has.
rating_averages <- summarise(
  group_by(movies, Year),
  Avg_Rating = mean(Rating),
  Num_Movies = n()
)

# Bubble chart, printed explicitly; the point size encodes the movie count.
print(
  ggplot(rating_averages, aes(x = Year, y = Avg_Rating, size = Num_Movies)) +
    theme_minimal() +
    labs(
      title = "Average Ratings of Turkish Movies Over the Years",
      x = "Year",
      y = "Average Rating"
    ) +
    geom_point()
)

Show the code
# Distribution of individual movie ratings within each year for the table
# currently held in `movies`.
ggplot(movies, aes(x = Year, y = Rating)) +
  theme_minimal() +
  labs(
    title = "Box Plots of Ratings for Turkish Movies Over the Years",
    x = "Year",
    y = "Rating"
  ) +
  geom_boxplot()

Back to top