# Load libraries
library(tidyverse)
library(dplyr)
library(stringr)
library(thestats)
# ==========================================
# STRING NORMALIZATION FUNCTION (UNICODE SAFE)
# ==========================================
# Build a lowercase, ASCII-folded matching key from a university name.
#
# `text` is a character vector. The Turkish letters (u/o with diaeresis,
# dotted and dotless i, s/c with cedilla, g with breve) are folded to plain
# ASCII *before* lowercasing, then the Turkish or English word for
# "university" is removed and whitespace is squished.
# Returns a character vector of the same length as `text`.
normalize_name <- function(text) {
  # Parallel vectors: fold_patterns[k] is replaced by fold_targets[k], in order.
  fold_patterns <- c(
    "\u00DC|\u00FC",
    "\u00D6|\u00F6",
    "\u0130|\u0131",
    "\u015E|\u015F",
    "\u00C7|\u00E7",
    "\u011E|\u011F"
  )
  fold_targets <- c("u", "o", "i", "s", "c", "g")

  folded <- text
  for (k in seq_along(fold_patterns)) {
    folded <- str_replace_all(folded, fold_patterns[[k]], fold_targets[[k]])
  }

  lowered <- str_to_lower(folded)
  # Drop the institution-type word so only the distinguishing part remains.
  stripped <- str_replace_all(lowered, "(?i).n.vers.tes.|university", "")
  str_squish(stripped)
}
# ==========================================
# STEP 1: YÖK ATLAS API DATA (<= 2020)
# ==========================================
# Fetch Industrial Engineering score rows via list_score() (English labels).
# NOTE(review): list_score() is presumably provided by the `thestats`
# package loaded at the top of the file -- confirm.
raw_thestats <- list_score(department_names = "Industrial Engineering", lang = "en")

# Clean the API rows for years up to and including 2020:
#   * drop departments whose name mentions an unrelated field,
#   * classify each university as "State" or "Foundation" (others dropped),
#   * build a normalized Join_Key from the university name,
#   * keep and rename the columns used later, and
#   * coerce Year / Quota / Rank to numeric and repair the scale of Rank.
clean_thestats <- raw_thestats %>%
  filter(as.numeric(year) <= 2020) %>%
  filter(!str_detect(str_to_lower(department), "woodworking|fisheries|aquaculture|forest|design")) %>%
  mutate(
    University_Type = case_when(
      str_detect(str_to_lower(type), "devlet|state") ~ "State",
      str_detect(str_to_lower(type), "vak|foundation|private") ~ "Foundation",
      TRUE ~ "Other"
    ),
    Join_Key = normalize_name(university)
  ) %>%
  filter(University_Type != "Other") %>%
  # NOTE(review): X15 and X9 are taken as rank and quota; these look like
  # auto-generated column names -- verify against the list_score() output.
  select(
    Year = year, University_Type, Join_Key, University_Name = university,
    Faculty_Name = faculty, Department_Name = department, Rank = X15, Quota = X9
  ) %>%
  mutate(
    Year = as.numeric(Year),
    Quota = as.numeric(Quota),
    Rank = as.numeric(Rank),
    # Rank values below 1000 are rescaled by 1000 in two cases. Presumably
    # the raw value used "." as a thousands separator and was parsed as a
    # decimal -- TODO confirm against the raw data.
    Rank = case_when(
      # Fractional value: always rescale. round() removes floating-point
      # residue (e.g. 1.005 * 1000 is 1004.9999999999999 as a double) so the
      # result is a whole-number rank.
      !is.na(Rank) & Rank < 1000 & Rank %% 1 != 0 ~ round(Rank * 1000),
      # Whole value below 1000: rescale unless the university is on the
      # exception list below, whose sub-1000 ranks are kept as they are.
      !is.na(Rank) & Rank < 1000 & Rank %% 1 == 0 & !str_detect(Join_Key, "koc|bilkent|bogazici|sabanci|middle east|galatasaray|tobb|istanbul technical") ~ Rank * 1000,
      TRUE ~ Rank
    )
  )
# ==========================================
# STEP 2: KAGGLE DATASET (2021 - 2024)
# ==========================================
# Read the admissions CSV; only rows with year > 2020 are used from it.
raw_kaggle <- read_csv("data/01_university_admissions_turkey_2019_2024.csv")

# Clean the CSV rows:
#   * keep years after 2020,
#   * keep departments whose name matches both "end.str" and "m.hendis"
#     and none of the exclusion patterns,
#   * classify each university as "State" or "Foundation" (others dropped),
#   * build a normalized Join_Key from the university name,
#   * keep and rename the columns used later and coerce metrics to numeric.
clean_kaggle <- raw_kaggle %>%
  filter(as.numeric(year) > 2020) %>%
  mutate(
    # Fold U+0130 (capital dotted I) to "i" BEFORE lowercasing. In the
    # default locale str_to_lower() maps it to "i" + combining dot (U+0307),
    # which would stop patterns such as "m.hendis" from matching upper-case
    # Turkish names and silently drop those rows.
    dept_kucuk = str_to_lower(str_replace_all(department_name, "\u0130", "i")),
    tur_kucuk = str_to_lower(str_replace_all(university_type, "\u0130", "i"))
  ) %>%
  filter(
    # "." stands in for the Turkish letters so either spelling matches.
    str_detect(dept_kucuk, "end.str") & str_detect(dept_kucuk, "m.hendis"),
    !str_detect(dept_kucuk, "orman|a.a.|tasar.m|su .r.nleri")
  ) %>%
  mutate(
    University_Type = case_when(
      str_detect(tur_kucuk, "devlet|state|kamu") ~ "State",
      str_detect(tur_kucuk, "vak|foundation|.zel|private") ~ "Foundation",
      TRUE ~ "Other"
    ),
    Join_Key = normalize_name(university_name)
  ) %>%
  filter(University_Type != "Other") %>%
  # The helper columns dept_kucuk / tur_kucuk are dropped by this select().
  select(
    Year = year, City = city, University_Type, Join_Key,
    University_Name = university_name, Faculty_Name = faculty_name,
    Department_Name = department_name, Rank = final_rank_012,
    Quota = total_quota, Preferences = total_preferences,
    Demand_Ratio = demand_per_quota, Top1_Pref = top_1_pref_count
  ) %>%
  mutate(across(c(Year, Rank, Quota, Preferences, Demand_Ratio, Top1_Pref), as.numeric))
# ==========================================
# STEP 3: MASTER DATA IMPUTATION & COMBINATION
# ==========================================
# Fixed seed so the simulated columns below are reproducible between runs.
set.seed(42)

# Stack both cleaned sources, fill gaps within each university, standardize
# the display names, replace remaining NAs with defaults and add two
# simulated columns.
ie_combined <- bind_rows(clean_thestats, clean_kaggle) %>%
  group_by(Join_Key) %>%
  arrange(Join_Key, desc(Year)) %>%
  # Within each Join_Key (newest year first), missing values are filled
  # from neighbouring rows in both directions (up first, then down).
  fill(City, Preferences, Demand_Ratio, Top1_Pref, .direction = "updown") %>%
  ungroup() %>%
  mutate(
    # str_to_title() in the default locale lowercases a non-initial U+0130
    # (capital dotted I) to "i" + combining dot (U+0307). The stray
    # combining mark is stripped afterwards so names contain a plain "i".
    University_Name = str_replace_all(
      str_to_title(str_squish(str_replace_all(University_Name, "(?i).n.vers.tes.|UNIVERSITY", "University"))),
      fixed("\u0307"), ""
    ),
    Faculty_Name = str_replace_all(
      str_to_title(str_squish(str_replace_all(Faculty_Name, "(?i)m.hend.sl.k", "Engineering"))),
      fixed("\u0307"), ""
    ),
    Department_Name = "Industrial Engineering",
    Rank = as.numeric(Rank),
    # NA handling and the 300K threshold: rows without a rank are assigned
    # 300,000 (those left at the 300,000 cut-off).
    Rank = ifelse(is.na(Rank), 300000, Rank),
    Quota = ifelse(is.na(Quota), 0, as.numeric(Quota)),
    # Simulated (not observed) values: a uniform random draw plus terms that
    # increase as Rank gets smaller and as Quota gets larger.
    Professor_Count = round(runif(n(), 4, 10) + (100000 / (Rank + 1000)) + (Quota / 15)),
    Erasmus_Students = round(runif(n(), 2, 8) + (80000 / (Rank + 800)) + (Quota / 20)),
    # Anything still missing after fill() becomes 0 / "Not Specified".
    Preferences = ifelse(is.na(Preferences), 0, as.numeric(Preferences)),
    Demand_Ratio = ifelse(is.na(Demand_Ratio), 0, as.numeric(Demand_Ratio)),
    Top1_Pref = ifelse(is.na(Top1_Pref), 0, as.numeric(Top1_Pref)),
    City = ifelse(is.na(City), "Not Specified", City)
  ) %>%
  select(-Join_Key) %>%
  arrange(desc(Year), University_Name)

# Save the master dataset for downstream analysis
save(ie_combined, file = "data/ie_master_data.RData")