Assignment 2

library(tidyverse) # for everything :)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(rvest) # for HTML scraping


Attaching package: 'rvest'

The following object is masked from 'package:readr':

    guess_encoding

library(stringr) # for string processing

# IMDb advanced-search result pages to scrape. Both queries ask for feature
# films with countries=TR, sorted by moviemeter, 250 results per page:
#   url  - release_date=2011-01-01,2023-12-31
#   url2 - release_date=,2010-12-31
# NOTE(review): the vote thresholds differ between the two queries
# (num_votes=2499, vs num_votes=2500,) - confirm this is intended.
url <- "https://m.imdb.com/search/title/?title_type=feature&release_date=2011-01-01,2023-12-31&sort=moviemeter,desc&num_votes=2499,&countries=TR&count=250"
url2 <- "https://m.imdb.com/search/title/?title_type=feature&release_date=,2010-12-31&sort=moviemeter,desc&num_votes=2500,&countries=TR&count=250"

# Download and parse each page once; every extraction step below reads from
# these two parsed documents.
html_content <- url |> read_html()
html_content2 <- url2 |> read_html()

 
# extract titles (movie names)

# Return the movie titles found on one parsed search-result page.
#
# `page` is a parsed HTML document (the result of read_html()).
# The result is always a character vector, even when no node matches.
scrape_titles <- function(page) {
  headings <- page |>
    html_nodes('.ipc-title__text') |>
    html_text()
  # The first and the last match are discarded - presumably page-level
  # headings rather than movies; verify against the live page.
  headings <- tail(head(headings, -1), -1)
  # Keep only the text after the first space, which drops the leading token
  # (presumably the list rank such as "1."). A heading without any space
  # yields NA.
  pieces <- str_split(headings, " ", n = 2)
  # vapply() keeps the result a character vector for empty input, where
  # unlist(lapply(...)) would return NULL.
  vapply(pieces, function(piece) piece[2], character(1))
}

title_names <- scrape_titles(html_content)
title_names2 <- scrape_titles(html_content2)

# Titles of both pages, first page first.
title <- c(title_names, title_names2)



# extract years
# Text of every element carrying these classes that is the first child of its
# parent. NOTE(review): the sc-*/jHYIIK class names look auto-generated and may
# change between site builds - verify the selector still matches.
year <- html_content |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)') |>
  html_text()
# Convert number-like strings to numeric, expanding a trailing "k" to thousands.
#
# @param x Character vector such as c("2013", "1.5k").
# @return Numeric vector the same length as `x`; entries that cannot be parsed
#   become NA (with the usual coercion warning).
convert_to_numeric <- function(x) {
  has_k <- grepl("k$", x)
  # Strip the suffix before parsing. Parsing the raw string as well (as an
  # ifelse() with as.numeric(x) in the "no" branch does) turns every "k" value
  # into NA with a spurious warning before it is discarded.
  as.numeric(sub("k$", "", x)) * ifelse(has_k, 1000, 1)
}
# convert_to_numeric() is built from vectorised calls (grepl/sub/ifelse), so it
# is applied to the whole vector at once. Going through sapply() would also
# attach the input strings as names to the result.
year <- convert_to_numeric(year)
#=================================================================================================
# Same extraction for the second page.
year2 <- html_content2 |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(1)') |>
  html_text()
# Convert number-like strings to numeric, expanding a trailing "k" to thousands.
# NOTE(review): this repeats the definition made earlier in the file; a single
# definition would suffice.
#
# @param x Character vector such as c("2013", "1.5k").
# @return Numeric vector the same length as `x`; entries that cannot be parsed
#   become NA (with the usual coercion warning).
convert_to_numeric <- function(x) {
  has_k <- grepl("k$", x)
  # Strip the suffix before parsing so "k" values are parsed exactly once and
  # do not raise a spurious "NAs introduced by coercion" warning.
  as.numeric(sub("k$", "", x)) * ifelse(has_k, 1000, 1)
}
# Apply the vectorised converter to the whole vector instead of element-wise
# sapply(), whose return type depends on its input.
year2 <- convert_to_numeric(year2)

# Release years of both pages, first page first. unname() guarantees a plain
# numeric vector even if either input still carries element names.
release <- unname(c(year, year2))

# extract the vote counts
# Every character that is not a digit 0-9 is removed from the node text before
# the remainder is converted to a number.
votes <- html_content |>
  html_nodes('.sc-53c98e73-0.kRnqtn') |>
  html_text() |>
  gsub(pattern = "[^0-9]", replacement = "") |>
  as.numeric()
#=================================================================================================
# Same extraction for the second page.
votes2 <- html_content2 |>
  html_nodes('.sc-53c98e73-0.kRnqtn') |>
  html_text() |>
  gsub(pattern = "[^0-9]", replacement = "") |>
  as.numeric()

# Vote counts of both pages, first page first.
vote <- c(votes, votes2)


# extract the durations
# The node text is searched for digits directly followed by "h" and digits
# directly followed by "m"; a missing part counts as 0. The result is the
# running time in minutes.
duration <- html_content |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(2)') |>
  html_text()
hours <- duration |> str_extract("\\d+(?=h)") |> as.numeric()
minutes <- duration |> str_extract("\\d+(?=m)") |> as.numeric()
hours[is.na(hours)] <- 0
minutes[is.na(minutes)] <- 0
total_minutes <- 60 * hours + minutes
#=================================================================================================
# Same extraction for the second page.
duration2 <- html_content2 |>
  html_nodes('.sc-43986a27-8.jHYIIK.dli-title-metadata-item:nth-child(2)') |>
  html_text()
hours2 <- duration2 |> str_extract("\\d+(?=h)") |> as.numeric()
minutes2 <- duration2 |> str_extract("\\d+(?=m)") |> as.numeric()
hours2[is.na(hours2)] <- 0
minutes2[is.na(minutes2)] <- 0
total_minutes2 <- 60 * hours2 + minutes2


# Durations of both pages, first page first.
total_time <- c(total_minutes, total_minutes2)



# extract the ratings
# The first "digits.digits" match in each node's text is taken as the rating;
# nodes without such a match become NA.
rating <- html_content |>
  html_nodes('.ratingGroup--imdb-rating') |>
  html_text() |>
  str_extract("\\d+\\.\\d+") |>
  as.numeric()
#=================================================================================================
# Same extraction for the second page.
rating2 <- html_content2 |>
  html_nodes('.ratingGroup--imdb-rating') |>
  html_text() |>
  str_extract("\\d+\\.\\d+") |>
  as.numeric()


# Ratings of both pages, first page first.
ratings <- c(rating, rating2)

# Assemble the scraped vectors into one tibble with one row per movie.
# The columns are matched purely by position, so they must all have the same
# length. Check that up front: a length-1 vector would otherwise be silently
# recycled across every row.
stopifnot(
  "scraped columns differ in length" =
    length(unique(lengths(list(title, release, ratings, vote, total_time)))) == 1L
)

# tibble() is the constructor for building a data frame from vectors;
# bind_cols() is meant for combining existing data frames.
DataFrame <- tibble(
  MovieTitle = title,
  Release = release,
  Rating = ratings,
  Votes = vote,
  Duration = total_time
)

Question

# Order the movies by rating, highest first, and show the first six rows.
DataFrame <- DataFrame |> arrange(desc(Rating))
DataFrame |> head() |> print()

# A tibble: 6 × 5
  MovieTitle                   Release Rating Votes Duration
  <chr>                          <dbl>  <dbl> <dbl>    <dbl>
1 Hababam Sinifi                  1975    9.2 42513       87
2 CM101MMXI Fundamentals          2013    9.1 46996      139
3 Hababam Sinifi Sinifta Kaldi    1975    8.9 24370       95
4 Tosun Pasa                      1976    8.9 24329       90
5 Süt Kardesler                   1976    8.8 20888       80
6 Hababam Sinifi Uyaniyor         1976    8.7 20640       94

# Show the last six rows of DataFrame in its current order.
DataFrame |> tail() |> print()

# A tibble: 6 × 5
  MovieTitle                     Release Rating Votes Duration
  <chr>                            <dbl>  <dbl> <dbl>    <dbl>
1 Enes Batur Gerçek Kahraman        2019    1.4  9513       99
2 15/07 Safak Vakti                 2021    1.2 20608       95
3 Müjde                             2022    1.2  9920       48
4 Cumali Ceber 2                    2018    1.2 10229      100
5 Reis                              2017    1   73973      108
6 Cumali Ceber: Allah Seni Alsin    2017    1   39267      100