# Assignment 2

library(tidyverse) # for everything :)
library(stringr) # for string processing
library(rvest) # for read_html(), html_nodes(), html_text()

# Define the two URLs to scrape; the results of both pages are combined below.
# NOTE(review): both strings begin with ",<date>&sort=..." and contain no
# scheme, host or path, so they look truncated. read_html() cannot fetch them
# as written -- TODO restore the full addresses before running.
url <- ",2023-12-31&sort=moviemeter,desc&num_votes=2499,&countries=TR&count=250"  # NOTE(review): num_votes is 2499 here but 2500 in url2 -- confirm which is intended
url2 <- ",2010-12-31&sort=moviemeter,desc&num_votes=2500,&countries=TR&count=250"

# Use read_html() to read the HTML content from each URL
html_content <- read_html(url)
html_content2 <- read_html(url2)

# Movie titles ----
# For each page: take the text of every '.ipc-title__text' node, discard the
# first and the last match, split each remaining string at its first space and
# keep the part after that space.
title_names <- html_content |>
  html_nodes('.ipc-title__text') |>
  html_text()
title_names <- title_names |>
  head(-1) |>
  tail(-1) |>
  str_split(" ", n = 2)
title_names <- unlist(lapply(title_names, `[`, 2))

title_names2 <- html_content2 |>
  html_nodes('.ipc-title__text') |>
  html_text()
title_names2 <- title_names2 |>
  head(-1) |>
  tail(-1) |>
  str_split(" ", n = 2)
title_names2 <- unlist(lapply(title_names2, `[`, 2))

# Titles of both pages, first page first
title <- c(title_names, title_names2) |> unlist()

# extract years ++

# Convert one text value to numeric. A trailing "k" is removed and the number
# in front of it is multiplied by 1000 (e.g. "2k" -> 2000); any other value is
# passed straight to as.numeric().
# NOTE(review): the "k" handling looks intended for abbreviated counts rather
# than years -- confirm whether it is needed here.
# Fix: the function used to be defined twice and neither definition had a
# closing brace, so the rest of the script was parsed as function body.
convert_to_numeric <- function(x) {
  ifelse(grepl("k$", x), as.numeric(sub("k$", "", x)) * 1000, as.numeric(x))
}

# First metadata item of each list entry on page 1
year <- html_content |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)') |>
  html_text()
# vapply() always returns a numeric vector, even for empty input, whereas
# sapply() would return an empty list in that case.
year <- vapply(year, convert_to_numeric, numeric(1), USE.NAMES = FALSE)

# Same extraction for page 2
year2 <- html_content2 |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)') |>
  html_text()
year2 <- vapply(year2, convert_to_numeric, numeric(1), USE.NAMES = FALSE)

# Release years of both pages, first page first
release <- unlist(c(year, year2))

# Vote counts ----
# Take the text of every '.sc-53c98e73-0.kRnqtn' node, strip every character
# that is not a digit, and read what is left as a number.
votes <- html_content |>
  html_nodes('.sc-53c98e73-0.kRnqtn') |>
  html_text()
votes <- gsub("[^0-9]", "", votes) |> as.numeric()

votes2 <- html_content2 |>
  html_nodes('.sc-53c98e73-0.kRnqtn') |>
  html_text()
votes2 <- gsub("[^0-9]", "", votes2) |> as.numeric()

# Vote counts of both pages, first page first
vote <- c(votes, votes2) |> unlist()

# extract the durations ++
# The duration text is read as an optional hour part (digits followed by "h")
# and an optional minute part (digits followed by "m"); the result is the total
# number of minutes.
# Fix: the NA replacement used to be `hours[] <- 0` / `minutes[] <- 0`, which
# overwrote every element and made every duration 0. Only the missing
# components (where str_extract() found no match) are now set to 0.

# Second metadata item of each list entry on page 1
duration <- html_content |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(2)') |>
  html_text()
hours <- as.numeric(str_extract(duration, "\\d+(?=h)"))
hours[is.na(hours)] <- 0 # no "h" part -> zero hours
minutes <- as.numeric(str_extract(duration, "\\d+(?=m)"))
minutes[is.na(minutes)] <- 0 # no "m" part -> zero minutes
total_minutes <- hours * 60 + minutes

# Same extraction for page 2
duration2 <- html_content2 |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(2)') |>
  html_text()
hours2 <- as.numeric(str_extract(duration2, "\\d+(?=h)"))
hours2[is.na(hours2)] <- 0
minutes2 <- as.numeric(str_extract(duration2, "\\d+(?=m)"))
minutes2[is.na(minutes2)] <- 0
total_minutes2 <- hours2 * 60 + minutes2

# Durations (in minutes) of both pages, first page first
total_time <- unlist(c(total_minutes, total_minutes2))

# Ratings ----
# Take the text of every '.ratingGroup--imdb-rating' node, pull out the first
# "<digits>.<digits>" match and convert it to numeric.
rating <- html_content |>
  html_nodes('.ratingGroup--imdb-rating') |>
  html_text()
rating <- as.numeric(str_extract(rating, "\\d+\\.\\d+"))

rating2 <- html_content2 |>
  html_nodes('.ratingGroup--imdb-rating') |>
  html_text()
rating2 <- as.numeric(str_extract(rating2, "\\d+\\.\\d+"))

# Ratings of both pages, first page first
ratings <- c(rating, rating2) |> unlist()

# Combine the scraped vectors into one table with one row per movie.
# Fix: the previous bind_cols() call was never closed and listed only Rating
# and Votes. The title, release year and duration vectors built above are now
# included as well.
# tibble() raises an error if the vectors differ in length, so a page element
# that is missing for some movie shows up here instead of silently misaligning
# the rows.
DataFrame <- tibble(
  MovieTitle = title,
  Release = release,
  Rating = ratings,
  Votes = vote,
  Duration = total_time
)

# Order the movies from highest to lowest rating
DataFrame <- arrange(DataFrame, desc(Rating))
# Console output pasted from an earlier run (presumably head(DataFrame) and
# tail(DataFrame)); kept as comments so that the script still parses.
#> # A tibble: 6 × 5
#>   MovieTitle                   Release Rating Votes Duration
#>   <chr>                          <dbl>  <dbl> <dbl>    <dbl>
#> 1 Hababam Sinifi                  1975    9.2 42513       87
#> 2 CM101MMXI Fundamentals          2013    9.1 46996      139
#> 3 Hababam Sinifi Sinifta Kaldi    1975    8.9 24370       95
#> 4 Tosun Pasa                      1976    8.9 24329       90
#> 5 Süt Kardesler                   1976    8.8 20888       80
#> 6 Hababam Sinifi Uyaniyor         1976    8.7 20640       94
#> # A tibble: 6 × 5
#>   MovieTitle                     Release Rating Votes Duration
#>   <chr>                            <dbl>  <dbl> <dbl>    <dbl>
#> 1 Enes Batur Gerçek Kahraman        2019    1.4  9513       99
#> 2 15/07 Safak Vakti                 2021    1.2 20608       95
#> 3 Müjde                             2022    1.2  9920       48
#> 4 Cumali Ceber 2                    2018    1.2 10229      100
#> 5 Reis                              2017    1   73973      108
#> 6 Cumali Ceber: Allah Seni Alsin    2017    1   39267      100
