5 OHE cleaning, linear reg, and XGBoost

(with dataset cleaning in the beginning)

Published

June 9, 2025

6 Data Type Cleaning & Unification

6.1 Load cleaned data set with dummy coding (ont-hot encoded) variables:

youth_orig <- read_csv("../../dssg-2025-mentor-canada/Data/encodedselectall.csv")

head(youth_orig)

Outcome variables (11 expected in total):

Q12. High school GED

QS1_19_HIGHSCHOOL
QS1_20_HIGHSCHOOL

Q44. Help-seeking

QS4_8_HELPSEEKING1_1_1, QS4_8_HELPSEEKING1_2_2, QS4_8_HELPSEEKING1_3_3, QS4_8_HELPSEEKING1_4_4, QS4_8_HELPSEEKING1_5_5, QS4_8_HELPSEEKING1_6_6, QS4_8_HELPSEEKING1_7_7, QS4_8_HELPSEEKING1_8_8, QS4_8_HELPSEEKING1_9_9, QS4_8_HELPSEEKING1_10_10

Q13. Further education/training -> counts at each level

QS1_21_FURTHEDUCA

Q13a. Higher education (controlling for age)

QS1_22_HIGHESTEDU
QS1_23_YEARCOMPLE

Q45. Mental health

QS4_9_MENTALHEALTH

Q14. Employment status

QS1_25_EMPLOYMENT
- We added the continuous yearly income variable: QS1_28_EMPLOYMENT_calculated

Q46. Mental well-being

QS4_10_MENTALWELLBE1_1_1, QS4_10_MENTALWELLBE1_2_2, QS4_10_MENTALWELLBE1_3_3, QS4_10_MENTALWELLBE1_4_4, QS4_10_MENTALWELLBE1_5_5, QS4_10_MENTALWELLBE1_6_6, QS4_10_MENTALWELLBE1_7_7

Q41. Career planning

QS4_3_CAREERPLANNIN1_1_1 QS4_3_CAREERPLANNIN1_2_2 QS4_3_CAREERPLANNIN1_3_3 QS4_3_CAREERPLANNIN1_4_4 QS4_3_CAREERPLANNIN1_5_5 QS4_3_CAREERPLANNIN1_6_6 QS4_3_CAREERPLANNIN1_7_7 QS4_3_CAREERPLANNIN1_8_8

Q27. Belonging

QS4_11_BELONGING

Q43. Social capital

QS4_7_SOCIALCAPITAL1_1_1, QS4_7_SOCIALCAPITAL1_2_2, QS4_7_SOCIALCAPITAL1_3_3, QS4_7_SOCIALCAPITAL1_4_4

Q50. Volunteering

QS4_14_FORMALVOL
QS4_15_TIMEIFFOR1
- Total hour
QS4_15_TIMEIFFOR2
- Hours per month
QS4_15_TIMEIFFOR3
- Hours per week
QS4_15_TIMEIFFOR4
- Hours per day
QS4_16_FORMALVOL
- Volunteered as teacher/mentor/educator in past 12 months.

high_school_ged <- c("QS1_19_HIGHSCHOOL", "QS1_20_HIGHSCHOOL")

further_educ <- c("QS1_21_FURTHEDUCA", "QS1_22_HIGHESTEDU", "QS1_23_YEARCOMPLE")

employment_status <- "QS1_25_EMPLOYMENT"
yearly_income <- "QS1_28_EMPLOYMENT_calculated"
career_planning <- c("QS4_3_CAREERPLANNIN1_1_1", "QS4_3_CAREERPLANNIN1_2_2", "QS4_3_CAREERPLANNIN1_3_3", "QS4_3_CAREERPLANNIN1_4_4", "QS4_3_CAREERPLANNIN1_5_5", "QS4_3_CAREERPLANNIN1_6_6", "QS4_3_CAREERPLANNIN1_7_7", "QS4_3_CAREERPLANNIN1_8_8")

social_capital <- c("QS4_7_SOCIALCAPITAL1_1_1", "QS4_7_SOCIALCAPITAL1_2_2", "QS4_7_SOCIALCAPITAL1_3_3", "QS4_7_SOCIALCAPITAL1_4_4")

help_seeking <- c("QS4_8_HELPSEEKING1_1_1", "QS4_8_HELPSEEKING1_2_2", "QS4_8_HELPSEEKING1_3_3", "QS4_8_HELPSEEKING1_4_4", "QS4_8_HELPSEEKING1_5_5", "QS4_8_HELPSEEKING1_6_6", "QS4_8_HELPSEEKING1_7_7", "QS4_8_HELPSEEKING1_8_8", "QS4_8_HELPSEEKING1_9_9", "QS4_8_HELPSEEKING1_10_10")
mental_health <- "QS4_9_MENTALHEALTH"
well_being <- c("QS4_10_MENTALWELLBE1_1_1", "QS4_10_MENTALWELLBE1_2_2", "QS4_10_MENTALWELLBE1_3_3", "QS4_10_MENTALWELLBE1_4_4", "QS4_10_MENTALWELLBE1_5_5", "QS4_10_MENTALWELLBE1_6_6", "QS4_10_MENTALWELLBE1_7_7")

belonging <- "QS4_11_BELONGING"

volunteering <- c("QS4_14_FORMALVOL", "QS4_15_TIMEIFFOR1", "QS4_15_TIMEIFFOR2", "QS4_15_TIMEIFFOR3", "QS4_15_TIMEIFFOR4", "QS4_16_FORMALVOL")


youth_iv <- youth_orig |>
            select(-all_of(c(high_school_ged,
                                further_educ,
                           employment_status, 
                               yearly_income,
                             career_planning,
                              social_capital, 
                                help_seeking, 
                               mental_health,
                                  well_being, 
                                   belonging,
                                volunteering)))

youth_dv <- youth_orig |>
            select(all_of(c(high_school_ged,
                               further_educ,
                          employment_status, 
                              yearly_income,
                            career_planning,
                             social_capital, 
                               help_seeking, 
                              mental_health,
                                 well_being, 
                                  belonging,
                               volunteering)))

6.1.0.1 Ensure all postal code are in proper upper cases:

youth_orig$geo_postcode_fsa <- toupper(youth_orig$geo_postcode_fsa)

6.1.1 Remove pre-processed column (binarized) by previous researchers

youth <- youth_orig |>
select(-c(QS1_8_NEWCOMERYEAR_cat:Week_income)) 

youth <- youth |>
select(-c(QS2_16_FORMAT_any:total_yearly_income))

6.1.2 Count proportion of empty entries per variable:

n <- youth |> nrow()

na_count_table <- youth |> summarize_all(~sum(is.na(.))) |>
                    pivot_longer(cols = everything(), names_to = "columns", values_to = "na_count") |>
                    mutate(na_percent = na_count / n * 100,
                           non_na_entries = n - na_count)

na_count_70_percent <- na_count_table |> filter(na_percent >= 70)
head(na_count_70_percent)

# A tibble: 6 × 4
  columns            na_count na_percent non_na_entries
  <chr>                 <int>      <dbl>          <int>
1 geo_postcode_fsa       2250       79.3            588
2 QS1_5_INDIGENOUSHS     2544       89.6            294
3 QS1_8_NEWCOMERYEAR     2421       85.3            417
4 QS1_17_INCARE          2752       97.0             86
5 QS1_26_EMPLOYMENT      2437       85.9            401
6 QS1_27_PLANNINGRE      2437       85.9            401

176 variables have >= 70% empty NA entries.

6.1.2.1 Identify highly sparse column (number of entries < 30):

cols_less_than_30_entires <- na_count_table |>
                             filter(non_na_entries < 30)
print(cols_less_than_30_entires, n = nrow(cols_less_than_30_entires))
cols_less_than_30_entires$columns

cols_less_than_90_percent <- na_count_table |>
                             filter(na_percent >= 90)
print(cols_less_than_90_percent, n = nrow(cols_less_than_90_percent))
cols_less_than_90_percent$columns

6.1.3 Remove Irrelevant Columns:

youth <- youth |>

# Remove text columns:
select(-c("QS2_25_YOUTHINIT1", "QS2_29_MATCHCRITERIA_O", "QS2_39_2_Other",  
          "QS4_23_PASTFORMA1", "QS2_39_1_Other", "QS2_39_3_Other")) |> 

# Remove columns with <30 valid entries:
select(-any_of(cols_less_than_30_entires$columns)) |>

# Remove repeated income columns:
select(-c("QS1_29_EMPLOYMENT", "QS1_30_EMPLOYMENT1", "QS1_30_EMPLOYMENT2", 
          "QS1_31_EMPLOYMENT", "QS1_32_WEEKLY")) |> 

# These columns were selected out in other files, so any_of() is used:
select(-any_of(c("QS2_15_RELATIONSHIP1", "QS2_5_Other", "QS2_17_TYPE_1_Other"))) |> 

# Take out postal code--it will be useful only when building geospatial plot:
select(-geo_postcode_fsa) |> 

# Text column: Name of organization with with respondent served as a mentor
select(-c("QS4_19_CURRENTME1", "QS4_19_CURRENTME2", "QS4_19_CURRENTME3")) |>

# Text column: name of mentor organization, else unsure (QS4_23_PASTFORMA2) or prefer not say (QS4_23_PASTFORMA3):
select(-c('QS4_23_PASTFORMA2', 'QS4_23_PASTFORMA3')) |> 

# This is an 'other' column for text written response:
select(-any_of(c("QS2_34_SUPPORTS1_10_10"))) |> 

# text entry:
select(-any_of(c("QS2_26_INITIATIONEV"))) |> 

# whether respondent remembered name of organization:
select(-any_of(c("QS2_27_MENTORPROGRA1")))

6.1.3.1 Remove `Prefer not to answer` / `Don't know` responses in the following ordinal items:

QS4_9_MENTALHEALTH
- "Don’t know"
- "Prefer not to answer"
QS4_11_BELONGING
- "Don’t know"
- "Prefer not to answer"
QS4_20_MENTEEAGE
- "Unsure"
- "Prefer not to say"

nrow(youth) # 2838
youth <- youth |>
  filter(!QS4_9_MENTALHEALTH %in% c("Don’t know", "Prefer not to answer"), # ordinal
         !QS4_11_BELONGING %in% c("Don’t know", "Prefer not to answer"), # ordinal
         !QS4_20_MENTEEAGE %in% c("Unsure", "Prefer not to say")) # ordinal
nrow(youth) # 2396

# Responses containing "Unsure"
unsure_counts <- tidy(colSums(youth == "Unsure", na.rm = TRUE))|>
                  mutate(is_unsure = ifelse(x >= 1, TRUE, FALSE)) |>
                    filter(`is_unsure` == "TRUE") 
# Responses containing "Prefer not to say"
pns_counts <- tidy(colSums(youth == "Prefer not to say", na.rm = TRUE))|>
                  mutate(is_unsure = ifelse(x >= 1, TRUE, FALSE)) |>
                    filter(`is_unsure`== "TRUE")

head(unsure_counts)

# A tibble: 6 × 3
  names                 x is_unsure
  <chr>             <dbl> <lgl>    
1 QS1_2_PROV            6 TRUE     
2 QS1_4_INDIGENOUS    101 TRUE     
3 QS1_12_DISABIL      133 TRUE     
4 QS1_13_DISABIL       43 TRUE     
5 QS1_22_HIGHESTEDU   110 TRUE     
6 QS2_1_MEANINGFULP   254 TRUE

head(pns_counts)

# A tibble: 6 × 3
  names                 x is_unsure
  <chr>             <dbl> <lgl>    
1 QS1_2_PROV           14 TRUE     
2 QS1_4_INDIGENOUS     75 TRUE     
3 QS1_11_SEXUALO       91 TRUE     
4 QS1_12_DISABIL       40 TRUE     
5 QS1_13_DISABIL       11 TRUE     
6 QS2_1_MEANINGFULP    29 TRUE

Career Planning Variables contain a huge number of ‘Unsure’ responses:

It would be more appropriate to return to this variable later and remove these observations.

To work with the rest of the data, it may serve the analysis better if QS4_3_CAREERPLANNIN1_1_1 to QS4_3_CAREERPLANNIN1_8_8 will be removed.

QS4_3_CAREERPLANNIN1_1_1: 486 Unsure responses

QS4_3_CAREERPLANNIN1_2_2: 413

QS4_3_CAREERPLANNIN1_3_3: 508

QS4_3_CAREERPLANNIN1_4_4: 463

QS4_3_CAREERPLANNIN1_5_5: 591

QS4_3_CAREERPLANNIN1_6_6: 354

QS4_3_CAREERPLANNIN1_7_7: 512

QS4_3_CAREERPLANNIN1_8_8: 717

Other variables containing Unsure responses:

Given the number of unsure responses in these variables are small relative to the number of total observation (~2,900), these their corresponding observation would be remove. The resultant dimension of the train ***** data will be examined.

6.2 Identify continuous metrics columns:

QS1_14_DISABIL (9b: At what age did disability / conditions emerge.)
QS1_15_DISABIL (9b1: How many years have the disability affected performance)
Education levels (years) being treated as continuous due to > 7 levels.
- QS1_18_PARENTEDUC1
  
  QS1_18_PARENTEDUC2
  
  QS1_20_HIGHSCHOOL
QS1_23_YEARCOMPLE (In what year completed higher education)
Employment wage / salary:
- QS1_29_EMPLOYMENT (removed)
- QS1_30_EMPLOYMENT1 (removed)
- QS1_30_EMPLOYMENT2 (removed)
- QS1_31_EMPLOYMENT (removed)
- QS1_32_WEEKLY (removed)
- Month_income
- Semimonth_income
- Biweek_income
- Week_income
- QS1_28_EMPLOYMENT_calculated
Volunteer hours:
- QS4_15_TIMEIFFOR1 (Total hour)
- QS4_15_TIMEIFFOR2 (Hours per month)
- QS4_15_TIMEIFFOR3 (Hours per week)
- QS4_15_TIMEIFFOR4 (Hours per day)
Number of mentors:
- QS2_10_NUMBEROFME

# metric variables, not including outcome `QS1_25_EMPLOYMENT`
metric_vars <- c("QS1_1_AGE", "QS1_8_NEWCOMERYEAR", "QS1_14_DISABIL", "QS1_15_DISABIL", "QS1_20_HIGHSCHOOL", "QS1_23_YEARCOMPLE", "QS2_10_NUMBEROFME", "QS4_15_TIMEIFFOR1", "QS4_15_TIMEIFFOR2", "QS4_15_TIMEIFFOR3", "QS4_15_TIMEIFFOR4",  "QS4_12_TRUST1_1_1", "QS4_12_TRUST1_2_2", "QS4_12_TRUST1_3_3", "QS4_12_TRUST1_4_4", "QS4_12_TRUST1_5_5", "QS4_14_FORMALVOL")

6.3 Convert all categorical columns into factor data type

Identify categorical columns:

cate_vars <- c("QS1_2_PROV", "QS1_3_COMMUNITYTYPE", "QS1_4_INDIGENOUS", 
               "QS1_5_INDIGENOUSHS", 
               "QS1_7_NEWCOMER", "QS1_10_TRANSUM", "QS1_11_SEXUALO", "QS1_12_DISABIL", "QS1_13_DISABIL", 
               "QS1_17_INCARE",  
               "QS1_19_HIGHSCHOOL", "QS1_21_FURTHEDUCA", "QS1_25_EMPLOYMENT", 
              "QS1_27_PLANNINGRE", 
              "QS1_28_EMPLOYMENT", "QS2_1_MEANINGFULP", "QS2_2_MEANINGFULP", "QS2_3_PRESENCEOFM", "QS2_4_MENTOR61FOR", 
              "QS2_5_MENTOR611PR", 
               "QS2_7_MENTOR611SE", "QS2_8_UNMETNEED61", "QS2_9_PRESENCEOFA", "QS2_11_MENTOR1218",  "QS2_12_UNMETNEED1", "QS2_16_FORMAT_1", "QS2_17_TYPE_1", "QS2_18_LOCATION_1", 
               "QS2_22_GEOLOCATI1", 
               "QS2_16_FORMAT_2", "QS2_17_TYPE_2", "QS2_18_LOCATION_2", 
               "QS2_22_GEOLOCATI2", 
               "QS2_16_FORMAT_3", "QS2_17_TYPE_3", "QS2_18_LOCATION_3", 
               "QS2_22_GEOLOCATI3",
               "QS2_23_MOSTMEANI", "QS2_24_MENTORAGE", 
              # "QS2_26_INITIATIONEV", # *Removed due to having 99.7% NA entries.
               
              # "QS2_27_MENTORPROGRA1", # Removed because it asks for 'recall of the name of mentor organization'
              "QS2_28_MATCHCHOICE", "Match_GenderIdentity", "Match_Ethnicity", "Match_CulturalBackground", "Match_ScheduleAvailability", "Match_Interests", "Match_Goals", "Match_Personalities", "Match_LifeStruggles", "Transition_School", "Transition_NewSchool", "Transition_NewCommunity", "Transition_GettingDriversLicense", "Transition_JobAspirations", "Transition_GettingFirstJob", "Transition_ApplyingToTradeSchool-Collge-Uni", "Transition_IndependenceFromGuardian", "Transition_FundingForTradeSchool-Collge-Uni", "Transition_NoneOfAbove",  "QS2_33_TRANSITIONS1_13_13", "QS2_33_TRANSITIONS1_14_14",  "QS3_4_LIFEEVENTS1_1_1", "QS3_4_LIFEEVENTS1_2_2", "QS3_4_LIFEEVENTS1_3_3", "QS3_4_LIFEEVENTS1_4_4", "QS3_4_LIFEEVENTS1_5_5", "QS3_4_LIFEEVENTS1_6_6", "QS3_4_LIFEEVENTS1_7_7", "QS3_4_LIFEEVENTS1_8_8", "QS3_4_LIFEEVENTS1_9_9", "QS3_4_LIFEEVENTS1_10_10", "QS3_4_LIFEEVENTS1_11_11", "QS3_4_LIFEEVENTS1_12_12", "QS3_4_LIFEEVENTS1_13_13", "QS3_4_LIFEEVENTS1_14_14", "QS3_4_LIFEEVENTS1_15_15", "QS3_4_LIFEEVENTS1_16_16", "QS3_4_LIFEEVENTS1_17_17", "QS3_4_LIFEEVENTS1_18_18", "QS3_4_LIFEEVENTS1_19_19", "QS3_4_LIFEEVENTS1_20_20", "QS4_1_MEANINGFULPERSON", "QS4_4_EDUCATIONALEXPEC", "QS4_6_DISAPPOINTED",  "QS4_13_LIFEEVE1_1_1", "QS4_13_LIFEEVE1_2_2", "QS4_13_LIFEEVE1_3_3", "QS4_13_LIFEEVE1_4_4", "QS4_13_LIFEEVE1_5_5", "QS4_13_LIFEEVE1_6_6",  "QS4_16_FORMALVOL", "QS4_17_SERVEDASM", "QS4_18_CURRENTOR", 
    "QS4_21_MENTORING", "QS4_22_PASTMENTO",  "QS4_25_FUTUREMEN",
    
"QS4_26_INTERNETC", "QS4_27_INTERNETC1_1_1", "QS4_27_INTERNETC1_2_2", "QS4_27_INTERNETC1_3_3", "QS4_27_INTERNETC1_4_4", "QS4_28_INTERNETCON", "QS4_29_PRIVATECONN", "QS4_30_INTERNETCON", "QS4_31_MOBILECONNE", "QS4_32_MOBILECONNE1_1_1", "QS4_32_MOBILECONNE1_2_2", "QS4_32_MOBILECONNE1_3_3", "QS4_32_MOBILECONNE1_4_4", "QS4_33_MOBILECONNECT"
    )

youth <- youth |>
  mutate(across(all_of(cate_vars), as_factor))

6.4 Identify Likert scale items columns (ordinal)

ordinal_numeric_vars <- c("QS1_18_PARENTEDUC1", "QS1_18_PARENTEDUC2", "QS2_19_DURATION_1", "QS2_20_EXPERIENCE_1", "QS2_19_DURATION_2", "QS2_20_EXPERIENCE_2", "QS2_19_DURATION_3", "QS2_20_EXPERIENCE_3", "QS2_30_MATCHSIMILAR1_1_1", "QS2_30_MATCHSIMILAR1_2_2", "QS2_30_MATCHSIMILAR1_3_3", "QS2_30_MATCHSIMILAR1_4_4", "QS2_30_MATCHSIMILAR1_5_5", "QS2_31_MENTORINGREL1_1_1", "QS2_31_MENTORINGREL1_2_2", "QS2_31_MENTORINGREL1_3_3", "QS2_31_MENTORINGREL1_4_4", "QS2_31_MENTORINGREL1_5_5", "QS2_32_MENTORINGENG1_1_1", "QS2_32_MENTORINGENG1_2_2", "QS2_32_MENTORINGENG1_3_3", "QS2_32_MENTORINGENG1_4_4", "QS2_32_MENTORINGENG1_5_5", "QS2_32_MENTORINGENG1_6_6", "QS2_32_MENTORINGENG1_7_7", "QS2_32_MENTORINGENG1_8_8", "QS2_32_MENTORINGENG1_9_9", "QS2_32_MENTORINGENG1_10_10", "QS2_32_MENTORINGENG1_11_11", "QS2_32_MENTORINGENG1_12_12", "QS2_32_MENTORINGENG1_13_13", "QS2_32_MENTORINGENG1_14_14", "QS2_32_MENTORINGENG1_15_15", "QS2_32_MENTORINGENG1_16_16", "QS2_32_MENTORINGENG1_17_17", "QS2_32_MENTORINGENG1_18_18", "QS2_32_MENTORINGENG1_19_19", "QS2_32_MENTORINGENG1_20_20", "QS2_32_MENTORINGENG1_21_21", "QS2_32_MENTORINGENG1_22_22", "QS2_35_SUPPORTSIMPO1_1_1", "QS2_35_SUPPORTSIMPO1_2_2", "QS2_35_SUPPORTSIMPO1_3_3", "QS2_35_SUPPORTSIMPO1_4_4", "QS2_35_SUPPORTSIMPO1_5_5", "QS2_35_SUPPORTSIMPO1_6_6", "QS2_35_SUPPORTSIMPO1_7_7", "QS2_35_SUPPORTSIMPO1_8_8", "QS2_35_SUPPORTSIMPO1_9_9", "QS2_35_SUPPORTSIMPO1_10_10", "QS2_37_HELPFULNESS", 
# "QS_40_REMATCHING_2", "QS_40_REMATCHING_3", 
"QS3_1_GLOBALSELFWOR1_1_1", "QS3_1_GLOBALSELFWOR1_2_2", "QS3_1_GLOBALSELFWOR1_3_3", "QS3_1_GLOBALSELFWOR1_4_4", "QS3_1_GLOBALSELFWOR1_5_5", "QS3_1_GLOBALSELFWOR1_6_6", "QS3_1_GLOBALSELFWOR1_7_7", "QS3_1_GLOBALSELFWOR1_8_8", "QS3_5_SCHOOLCLIMATE1_1_1", "QS3_5_SCHOOLCLIMATE1_2_2", "QS3_5_SCHOOLCLIMATE1_3_3", "QS3_5_SCHOOLCLIMATE1_4_4", "QS3_5_SCHOOLCLIMATE1_5_5", "QS3_5_SCHOOLCLIMATE1_6_6", "QS3_5_SCHOOLCLIMATE1_7_7", "QS3_5_SCHOOLCLIMATE1_8_8", "QS3_5_SCHOOLCLIMATE1_9_9", "QS3_5_SCHOOLCLIMATE1_10_10", "QS4_2_MEANINGFULPERSON",    "QS4_8_HELPSEEKING1_1_1", "QS4_8_HELPSEEKING1_2_2", "QS4_8_HELPSEEKING1_3_3", "QS4_8_HELPSEEKING1_4_4", "QS4_8_HELPSEEKING1_5_5", "QS4_8_HELPSEEKING1_6_6", "QS4_8_HELPSEEKING1_7_7", "QS4_8_HELPSEEKING1_8_8", "QS4_8_HELPSEEKING1_9_9", "QS4_8_HELPSEEKING1_10_10", "QS4_10_MENTALWELLBE1_1_1", "QS4_10_MENTALWELLBE1_2_2", "QS4_10_MENTALWELLBE1_3_3", "QS4_10_MENTALWELLBE1_4_4", "QS4_10_MENTALWELLBE1_5_5", "QS4_10_MENTALWELLBE1_6_6", "QS4_10_MENTALWELLBE1_7_7"
                          ) 

ordinal_chr_vars <- c("QS1_22_HIGHESTEDU", "QS2_6_MENTOREXPER", "QS2_34_SUPPORTS1_1_1", "QS2_34_SUPPORTS1_2_2", "QS2_34_SUPPORTS1_3_3", "QS2_34_SUPPORTS1_4_4", "QS2_34_SUPPORTS1_5_5", "QS2_34_SUPPORTS1_6_6", "QS2_34_SUPPORTS1_7_7", "QS2_34_SUPPORTS1_8_8", "QS2_34_SUPPORTS1_9_9", "QS2_36_INFLUENCE1_1_1", "QS2_36_INFLUENCE1_2_2", "QS2_36_INFLUENCE1_3_3", "QS2_36_INFLUENCE1_4_4", "QS2_36_INFLUENCE1_5_5", "QS2_36_INFLUENCE1_6_6", "QS2_36_INFLUENCE1_7_7", "QS2_36_INFLUENCE1_8_8", "QS2_36_INFLUENCE1_9_9", "QS4_3_CAREERPLANNIN1_1_1", "QS4_3_CAREERPLANNIN1_2_2", "QS4_3_CAREERPLANNIN1_3_3", "QS4_3_CAREERPLANNIN1_4_4", "QS4_3_CAREERPLANNIN1_5_5", "QS4_3_CAREERPLANNIN1_6_6", "QS4_3_CAREERPLANNIN1_7_7", "QS4_3_CAREERPLANNIN1_8_8",  "QS4_5_SATEDU", "QS4_7_SOCIALCAPITAL1_1_1", "QS4_7_SOCIALCAPITAL1_2_2", "QS4_7_SOCIALCAPITAL1_3_3", "QS4_7_SOCIALCAPITAL1_4_4","QS4_9_MENTALHEALTH",  "QS4_11_BELONGING","QS4_20_MENTEEAGE", "QS4_24_FUTUREMEN")

6.4.1 Handelling Varied Likert Scale Items: Label Alignment Issue

6.4.1.1 Task: Reverse coding for raw ordinal columns with numeric encoding:

For some ordinal columns (numerically encoded ones), there is incongruency in their valance direction. The issue at hand is to ensure higher numeric values will consistently represent the ‘positive’ or ‘high’ end of the scale (e.g., agreement, positivity, truth) across all variables:

Question 11: QS1_18_PARENTEDUC*
Question 20f: QS2_19_DURATION_*
- <3 months, 3-6 months, 7-11 months, 12 months or more, It’s still ongoing
Question 20g: QS2_19_EXPERIENCE_*
- Always positive; Mostly positive; Neutral; Mostly negative; Always negative
Question 26: QS2_30_MATCHSIMILAR1_*_*
- (1) Disagree (2) Neither agree nor disagree (3) Agree
Question 27: QS2_31_MENTORINGREL1_*_*
- 1. Strongly disagree 2. Disagree 3. Neither agree nor disagree 4. Agree 5. Strongly agree
Question 28: QS2_32_MENTORINGENG1_*_*
- 1.Not very true—2.Sometimes true—3.Very true
Question 30: QS2_35_SUPPORTSIMPO1_6_6
- Not very true—Sometimes true—Very true
Question 36: QS3_1_GLOBALSELFWOR1_2_2
- (1) Strongly disagree (2) Disagree (3) Agree (4) Strongly agree
Question 39: QS3_5_SCHOOLCLIMATE1_*_*
- Strongly Agree——- A——-D——–Strongly Disagree
Question44: QS4_8_HELPSEEKING1_*_*
- (1) Not at all true (2) (3) (4) (5) Completely true
Question 46 QS4_10_MENTALWELLBE1_*_*:
- (1) None of the time — (2) Rarely — (3) Some of the time — (4) Often — (5) All of the time

Steps: QS2_19_EXPERIENCE_(1-5 scale: Always positive to Always negative) and QS3_5_SCHOOLCLIMATE1_ (1-4 scale: Strongly Agree to Strongly Disagree) require reverse coding.

reversed_coding <- youth |>
                  mutate(# Reverse QS2_19_EXPERIENCE_* (1-5 scale: Always positive to Always negative)
                         across(matches("^QS2_19_EXPERIENCE_"), ~ 6 - .,  .names = "reversed_{.col}"),
                      
                         # Reverse QS3_5_SCHOOLCLIMATE1_*_* (1-4 scale: Strongly Agree to Strongly Disagree)
                         across(matches("^QS3_5_SCHOOLCLIMATE1_"), ~ 5 - ., .names = "reversed_{.col}"))

# Validate reversed coding:
reversed_coding |>
  select(starts_with("QS2_19_EXPERIENCE_"), starts_with("reversed_QS2_19_EXPERIENCE_"),
         starts_with("QS3_5_SCHOOLCLIMATE1_"), starts_with("reversed_QS3_5_SCHOOLCLIMATE1_")) %>%
  head()

# A tibble: 6 × 20
  QS3_5_SCHOOLCLIMATE1_1_1 QS3_5_SCHOOLCLIMATE1_2_2 QS3_5_SCHOOLCLIMATE1_3_3
                     <dbl>                    <dbl>                    <dbl>
1                        4                        4                        4
2                        1                        2                        2
3                        1                        1                        1
4                        3                        2                        2
5                        3                        3                        3
6                        2                        2                        3
# ℹ 17 more variables: QS3_5_SCHOOLCLIMATE1_4_4 <dbl>,
#   QS3_5_SCHOOLCLIMATE1_5_5 <dbl>, QS3_5_SCHOOLCLIMATE1_6_6 <dbl>,
#   QS3_5_SCHOOLCLIMATE1_7_7 <dbl>, QS3_5_SCHOOLCLIMATE1_8_8 <dbl>,
#   QS3_5_SCHOOLCLIMATE1_9_9 <dbl>, QS3_5_SCHOOLCLIMATE1_10_10 <dbl>,
#   reversed_QS3_5_SCHOOLCLIMATE1_1_1 <dbl>,
#   reversed_QS3_5_SCHOOLCLIMATE1_2_2 <dbl>,
#   reversed_QS3_5_SCHOOLCLIMATE1_3_3 <dbl>, …

reversed_coding shows that reversed_QS2_19_EXPERIENCE_ and reversed_QS3_5_SCHOOLCLIMATE1_ are successful reverse encoding of QS2_19_EXPERIENCE_ and QS3_5_SCHOOLCLIMATE1_!

Now, let’s apply the this reverse encoding onto the original youth data set, replacing the QS2_19_EXPERIENCE_ and QS3_5_SCHOOLCLIMATE1_ directly in the original data frame:

youth <- youth |>
  mutate(# Reverse QS2_19_EXPERIENCE_* (1-5 scale: Always positive to Always negative)
         across(matches("^QS2_19_EXPERIENCE_"), ~ 6 - .),
      
         # Reverse QS3_5_SCHOOLCLIMATE1_*_* (1-4 scale: Strongly Agree to Strongly Disagree)
         across(matches("^QS3_5_SCHOOLCLIMATE1_"), ~ 5 - .))

# Once again, validate result:
youth |>
  select(starts_with("QS2_19_EXPERIENCE_"), starts_with("QS3_5_SCHOOLCLIMATE1_")) |>
  head()

# A tibble: 6 × 10
  QS3_5_SCHOOLCLIMATE1_1_1 QS3_5_SCHOOLCLIMATE1_2_2 QS3_5_SCHOOLCLIMATE1_3_3
                     <dbl>                    <dbl>                    <dbl>
1                        1                        1                        1
2                        4                        3                        3
3                        4                        4                        4
4                        2                        3                        3
5                        2                        2                        2
6                        3                        3                        2
# ℹ 7 more variables: QS3_5_SCHOOLCLIMATE1_4_4 <dbl>,
#   QS3_5_SCHOOLCLIMATE1_5_5 <dbl>, QS3_5_SCHOOLCLIMATE1_6_6 <dbl>,
#   QS3_5_SCHOOLCLIMATE1_7_7 <dbl>, QS3_5_SCHOOLCLIMATE1_8_8 <dbl>,
#   QS3_5_SCHOOLCLIMATE1_9_9 <dbl>, QS3_5_SCHOOLCLIMATE1_10_10 <dbl>

6.4.2 Convert all string/character Likert scale items columns into ordinal data type:

youth <- youth |>
  mutate(across(all_of(ordinal_numeric_vars), as_factor)) # converts ordinal items encoded in either (1) numbers or (2) strings into factor.
youth <- youth |>
  mutate(across(all_of(ordinal_chr_vars), as_factor))

6.4.3 Convert string/character Likert scale level columns into ordered numeric:

Specify string/character-labeled Likert levels:

experience_ord <- list(levels = 1:5, labels = c("Always negative", "Mostly negative", "Somewhat positive", "Mostly positive", "Always positive"))
support_ord <- list(levels = 1:3, labels = c("Not very true","Sometimes true","Very true"))
influence_ord <- list(levels = 1:4, labels = c("A little","Some","Quite a bit","A lot")) # No 'None' in response range
career_plan_ord <- list(levels = 1:7, labels = c("Completely disagree","Disagree","Somewhat disagree","Unsure","Somewhat agree","Agree","Completely agree"))
social_capital_ord <- list(levels = 1:5, labels = c("Strongly disagree","Disagree","Neutral","Agree","Strongly Agree"))
mental_health_ord <- list(levels = 1:5, labels = c("Poor","Fair","Good","Very good", "Excellent")) 
# "Don't know"  / 'Prefer not to answer' is not included (previously filtered)
belonging_ord <- list(levels = 1:4, labels = c("Very weak","Somewhat weak","Somewhat strong","Very strong"))
mentee_age_ord <- list(levels = 1:4, labels = c("Under 18 years old","18-25 years old","26-29 years old","30 years old or older"))
mentoring_interest_ord <- list(levels = 1:4, labels = c("Not interested at all", "Not that interested","Fairly interested","Very interested"))

high_edu_ord <- list(levels = 1:11, labels = c("Certificate of Apprenticeship or Certificate of Qualification", 
                                               "Other trades certificate or diploma", 
                                               "College, CEGEP or other non-university certificate or diploma", 
                                               "University diploma or certificate BELOW Bachelor’s Degree", 
                                               "Bachelor’s degree (e.g. BA, BSc, B.Ed., B.Eng including LL.B-law degree)",
                                               "University diploma or certificate ABOVE Bachelor’s Degree",
                                               "First professional degree (e.g. medicine, dentistry, veterinary medicine, optometry)", 
                                               "Master's degree (e.g. M.B.A., M.A., M.Sc.)", 
                                               "Doctorate degree (earned e.g. Ph.D, D.Sc., D.Ed) or post-doctoral program", 
                                               "Other, please specify", 
                                               "Unsure"))

Compile all character-labeled Likert items in a list (i.e., a list of list in this case).

lst_of_ord_cols <- list(QS2_6_MENTOREXPER = experience_ord,
                        
                        QS2_34_SUPPORTS1_1_1 = support_ord,
                        QS2_34_SUPPORTS1_2_2 = support_ord,
                        QS2_34_SUPPORTS1_3_3 = support_ord,
                        QS2_34_SUPPORTS1_4_4 = support_ord,
                        QS2_34_SUPPORTS1_5_5 = support_ord,
                        QS2_34_SUPPORTS1_6_6 = support_ord,
                        QS2_34_SUPPORTS1_7_7 = support_ord,
                        QS2_34_SUPPORTS1_8_8 = support_ord,
                        QS2_34_SUPPORTS1_9_9 = support_ord,
                        
                        QS2_36_INFLUENCE1_1_1 = influence_ord,
                        QS2_36_INFLUENCE1_2_2 = influence_ord,
                        QS2_36_INFLUENCE1_3_3 = influence_ord,
                        QS2_36_INFLUENCE1_4_4 = influence_ord,
                        QS2_36_INFLUENCE1_5_5 = influence_ord,
                        QS2_36_INFLUENCE1_6_6 = influence_ord,
                        QS2_36_INFLUENCE1_7_7 = influence_ord,
                        QS2_36_INFLUENCE1_8_8 = influence_ord,
                        QS2_36_INFLUENCE1_9_9 = influence_ord,
                        
                        QS4_3_CAREERPLANNIN1_1_1 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_2_2 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_3_3 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_4_4 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_5_5 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_6_6 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_7_7 = career_plan_ord,
                        QS4_3_CAREERPLANNIN1_8_8 = career_plan_ord,
                        
                        QS4_7_SOCIALCAPITAL1_1_1 = social_capital_ord,
                        QS4_7_SOCIALCAPITAL1_2_2 = social_capital_ord,
                        QS4_7_SOCIALCAPITAL1_3_3 = social_capital_ord,
                        QS4_7_SOCIALCAPITAL1_4_4 = social_capital_ord,
                        
                        QS4_9_MENTALHEALTH = mental_health_ord,
                        QS4_11_BELONGING = belonging_ord,
                        QS4_20_MENTEEAGE = mentee_age_ord,
                        QS4_24_FUTUREMEN = mentoring_interest_ord,

                        
                        QS1_22_HIGHESTEDU = high_edu_ord,
                        QS4_5_SATEDU = high_edu_ord)

Write a function to convert all character labels to the correct ordinal numeric scores!

6.4.3.0.1 Function 1: For converting string type ordinal leveled columns into well-ordered integers:

convert_string_ord_fac <- function(data, lst_ord_level_cols, lst_col_names) {
 data <- data |>
  mutate(across(.cols = all_of(lst_col_names),
                .fns = ~ {
                  col_name <- cur_column()
                  scale <- lst_ord_level_cols[[col_name]]
                  factor(.x, 
                         levels = scale$labels, # the string lebels are the 'levels'
                         labels = scale$levels, # now we want the `labels` to be numeric, according to the levels.
                         ordered = TRUE)}))
    return(data)}

Apply above function convert_ord_fac()

youth |> select(all_of(c(ordinal_chr_vars))) |> str() # before conversion
youth <- convert_string_ord_fac(youth, lst_of_ord_cols, ordinal_chr_vars)

# youth |> select(all_of(c(ordinal_chr_vars))) |> str() # after conversion
  # Validated! Success!

6.4.3.0.2 Function 2: For converting numeric type ordinal leveled columns into ordered factor

convert_number_ord_fac <- function(data, lst_col_names) {
  data[lst_col_names] <- lapply(data[lst_col_names], function(x) {
    factor(x, levels = sort(unique(x)), ordered = TRUE)
  })
  
  return(data)}

Apply above Function 2 to ordinal_numeric_vars

youth |> select(all_of(c(ordinal_numeric_vars))) |> str() # before conversion

youth <- convert_number_ord_fac(youth, ordinal_numeric_vars)
# youth |> select(all_of(c(ordinal_numeric_vars))) |> str() # after conversion
  # Validated! Another success!

6.4.4 `Unsure` response entries in ordinal columns:

6.4.4.1 `QS4_3_CAREERPLANNIN1_1_1` to `QS4_3_CAREERPLANNIN1_8_8`: The only ordinal column with `Unsure` entries

The following table examines number of Unsure entries here: 40 “Unsure” responses:

# Summary number of `Unsure` responses for QS4_3_CAREERPLANNIN:
bind_cols(list(count_career_1, count_career_2, count_career_3, count_career_4, count_career_5, count_career_6, count_career_7, count_career_8))

# A tibble: 1 × 8
  count_career_1 count_career_2 count_career_3 count_career_4 count_career_5
           <int>          <int>          <int>          <int>          <int>
1            342            308            380            343            442
# ℹ 3 more variables: count_career_6 <int>, count_career_7 <int>,
#   count_career_8 <int>

Decision: We reason that since Unsure in the context of this question serve similar to a ‘undecided’ stance between the two Likert end, we would retain Unsure for this ordinal item.

6.4.5 `Unsure` and `Other, please specify` entries in `QS4_4_EDUCATIONALEXPEC`

Other, please specify = 10
Unsure = 11

cat_unsure_pnts_10 <- c("QS1_22_HIGHESTEDU", "QS4_4_EDUCATIONALEXPEC") # 10 = Other, 11 = Unsure
youth |>
select(QS1_22_HIGHESTEDU, QS4_4_EDUCATIONALEXPEC) |>
mutate(across(everything(), as.numeric)) |>
filter(QS1_22_HIGHESTEDU > 10) |>
summarize(count_high_edu_na = n())

# A tibble: 1 × 1
  count_high_edu_na
              <int>
1               110

A total of 110 respondents who had answered either Unsure or Other, please specify for question 13a (QS1_22_HIGHESTEDU).

youth |>
select(QS1_22_HIGHESTEDU, QS4_4_EDUCATIONALEXPEC) |>
mutate(across(everything(), as.numeric)) |>
filter(QS4_4_EDUCATIONALEXPEC >= 10) |>
summarize(count_high_edu_na = n())

# A tibble: 1 × 1
  count_high_edu_na
              <int>
1                91

A total of 91 respondents who had answered either Unsure or Other, please specify for question 42 (QS4_4_EDUCATIONALEXPEC).

Decision: We choose retain Unsure for this ordinal item.

6.4.6 Remove `_PreferNotToSay`, `_Unsure`, or `*_Other` OHE’d categorical variables directly from raw data:

Downstream Plan 1: These *_PreferNotToSay, *_Unsure, and *_Other ending variables are already in the raw dataframe. The subsequent one-hot encoding we’ll perform on other nominal variables that had not been properly OHE’d will go through the same removal once again.
Downstream Plan 2: There are binary (Yes or No) responses to some of the categorical columns as well. After the downstream OHE, the No column corresponding to those binary columns will be removed.

youth |> 
  select((ends_with(c("_PreferNotToSay", "_Unsure", "_Other")))) |>
  ncol()

[1] 21

# Removal:
youth <- youth |> 
  select(-(ends_with(c("_PreferNotToSay", "_Unsure", "_Other"))))

There are 21 columns of *_PreferNotToSay, *_Unsure, or *_Other categorical variables, and these columns have been removed.

7 Data train-test split

split <- initial_split(youth, prop = .80, strata = QS1_28_EMPLOYMENT_calculated)
train_data <- training(split)
test_data <- testing(split)

7.0.1 Impute missing (NA) entries in ordinal columns:

Imputation should be done after train/test data splitting to prevent test data leakage into the train set!

preprocessing_recipe <- recipes::recipe(QS1_28_EMPLOYMENT_calculated ~ ., data = train_data) |>
    # step_impute_knn(all_of(c(all_nominal_predictors(), 
    #                 all_numeric_predictors())), 
    #                 -all_of(metric_vars), neighbors = 5) |>  # KNN Imputation is too computational time consuming for this large dataset without special handling-- we pass it just for now.

    step_impute_mode(all_ordered_predictors()) |>
    step_mutate_at(   all_ordered_predictors(), fn = as.numeric) |>

    step_impute_mode(all_nominal_predictors()) |>

    # cate_vars are those vars that were categorical but not ohe.
    step_dummy(any_of(all_nominal_predictors()), one_hot = TRUE) |> 

    # Standardize continuous variables:
    step_center(all_of(metric_vars)) |>
    step_scale(all_of(metric_vars)) |>

    # Mean impute continuous variables:
    step_impute_mean(all_of(metric_vars)) |>

    step_impute_median(all_numeric_predictors(), -all_of(metric_vars)) |>

    # Impute income output with its median
    step_impute_median(all_outcomes())

Comment:

Code viewers can run the following code snippet in the console to see concise count of each ordinal level! :
    mean_mode_imputed_train |> select(all_of(c(ordinal_numeric_vars, ordinal_chr_vars))) |> summary()
    mean_mode_imputed_train |> select(all_of(ordinal_chr_vars)) |> summary()
Continuous variables are standardized.

mean_mode_impute_prep <- preprocessing_recipe |>
                         prep()

mean_mode_imputed_train <- bake(mean_mode_impute_prep, new_data = NULL)
mean_mode_imputed_test   <- bake(mean_mode_impute_prep, new_data = test_data)

7.0.2 Only one-hot encode the nominal columns without imputation:

Continuous variables are standardized.

preprocessing_recipe_ohe_unimputed <- recipes::recipe(QS1_28_EMPLOYMENT_calculated ~ ., data = train_data) |>
    step_dummy(all_of(cate_vars), one_hot = TRUE) |>

    step_center(all_of(metric_vars)) |>
    step_scale(all_of(metric_vars))

ohe_unimpute_prep <- preprocessing_recipe_ohe_unimputed |>
                     prep()

ohe_unimputed_train <- bake(ohe_unimpute_prep, new_data = NULL)
ohe_unimputed_test  <- bake(ohe_unimpute_prep, new_data = test_data)

Validate post-ohe, column names are the same between mean_mode_imputed_train and ohe_unimputed_train:

identical(ncol(ohe_unimputed_train), ncol(mean_mode_imputed_train))# Outputs TRUE! They are the same!

[1] TRUE

identical(nrow(ohe_unimputed_train), nrow(mean_mode_imputed_train))# Outputs TRUE! They are the same!

[1] TRUE

nrow(ohe_unimputed_train)

[1] 1915

nrow(mean_mode_imputed_train)

[1] 1915

nrow(youth)

[1] 2396

ncol(mean_mode_imputed_train)

[1] 699

ncol(ohe_unimputed_train)

[1] 699

ncol(youth)

[1] 401

7.1 Examine new one-hot encoded columns:

Task 1: Let’s look at how many new one-hot encoded columns contain *_PreferNotToSay, *_Unsure, and *_Other endings, which need to be removed.

Task 2: Also look at how many new ohe’d columns contain the ending of: *_No and numeric encoding for *_PreferNotToSay, *_Unsure, and *_Other.

7.1.0.1 Remove `_PreferNotToSay`, `_Prefer.not.to.answer`, `_Unsure`, and `_Other`:

mean_mode_imputed_train |> 
select((ends_with(c("_PreferNotToSay", "_Prefer.not.to.say", "_Prefer.not.to.answer", "_Unsure", "_Other")))) |>
ncol()

[1] 43

ohe_unimputed_train |>
select((ends_with(c("_PreferNotToSay", "_Prefer.not.to.say", "_Prefer.not.to.answer", "_Unsure", "_Other")))) |>
ncol()

[1] 43

Both imputed and unimputed OHE’d train datasets output 24, meaning there are 24 columns of “*_PreferNotToSay”, “_Unsure”, “_Other”.

# Removals (train):
mean_mode_imputed_train <- mean_mode_imputed_train |> 
  select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))

ohe_unimputed_train <- ohe_unimputed_train |> 
  select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))

# Removals (test):
mean_mode_imputed_test <- mean_mode_imputed_test |> 
  select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))

ohe_unimputed_test <- ohe_unimputed_test |> 
  select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))

7.1.0.2 Remove `*_No`

mean_mode_imputed_train |> 
select((contains(c("_No")))) |>
ncol()

[1] 47

mean_mode_imputed_train |> 
select((ends_with(c("_No")))) |>
ncol()

[1] 38

ohe_unimputed_train |> 
select((ends_with(c("_No")))) |>
ncol()

[1] 38

Both imputed and unimputed OHE’d train datasets output 38, meaning there are 38 columns of “*_No”.

# Removals (train):
mean_mode_imputed_train <- mean_mode_imputed_train |> 
  select(-(contains(c("_No"))))

ohe_unimputed_train <- ohe_unimputed_train |> 
  select(-(contains(c("_No"))))

# Removals (test):
mean_mode_imputed_test <- mean_mode_imputed_test |> 
  select(-(contains(c("_No"))))

ohe_unimputed_test <- ohe_unimputed_test |> 
  select(-(contains(c("_No"))))

7.1.0.3 Remove numerically encoded `_PreferNotToSay`, `_Unsure`, `_Other`, and `_No`:

7.1.0.4 _PreferNotToSay: For convenience, we sometimes use ‘pnts’ short for Prefer Not to Say.

# >=2 is no/unsure/pnts:
cat_unsure_pnts_2_or_more <- c("QS1_5_INDIGENOUSHS", "QS1_17_INCARE", "QS1_27_PLANNINGRE", 
                               "QS2_5_MENTOR611PR", "QS2_7_MENTOR611SE", "QS2_11_MENTOR1218",
                               'QS2_22_GEOLOCATI1', 'QS2_22_GEOLOCATI2', 'QS2_22_GEOLOCATI3', 
                               'QS3_4_LIFEEVENTS1_1_1', 'QS3_4_LIFEEVENTS1_2_2', 'QS3_4_LIFEEVENTS1_3_3', 
                               'QS3_4_LIFEEVENTS1_4_4', 'QS3_4_LIFEEVENTS1_5_5', 'QS3_4_LIFEEVENTS1_6_6', 
                               'QS3_4_LIFEEVENTS1_7_7', 'QS3_4_LIFEEVENTS1_8_8', 'QS3_4_LIFEEVENTS1_9_9', 
                               'QS3_4_LIFEEVENTS1_10_10', 'QS3_4_LIFEEVENTS1_11_11', 'QS3_4_LIFEEVENTS1_12_12', 
                               'QS3_4_LIFEEVENTS1_13_13', 'QS3_4_LIFEEVENTS1_14_14', 'QS3_4_LIFEEVENTS1_15_15', 
                               'QS3_4_LIFEEVENTS1_16_16', 'QS3_4_LIFEEVENTS1_17_17', 'QS3_4_LIFEEVENTS1_18_18', 
                               'QS3_4_LIFEEVENTS1_19_19', 'QS3_4_LIFEEVENTS1_20_20',  'QS4_6_DISAPPOINTED', 
                               'QS4_13_LIFEEVE1_1_1', 'QS4_13_LIFEEVE1_2_2', 'QS4_13_LIFEEVE1_3_3',
                               'QS4_13_LIFEEVE1_4_4', 'QS4_13_LIFEEVE1_5_5', 'QS4_13_LIFEEVE1_6_6', 
                               'QS4_16_FORMALVOL', 'QS4_26_INTERNETC', 
                               'QS4_28_INTERNETCON', 'QS4_31_MOBILECONNE', 'QS4_33_MOBILECONNECT')
  

cat_unsure_pnts_3 <- c("QS4_1_MEANINGFULPERSON",
                       "QS4_21_MENTORING",
                      "QS4_29_PRIVATECONN"
                    )  # >=3 is no/unsure/pnts

cat_unsure_pnts_4 <- c("QS4_22_PASTMENTO",
                       "QS2_17_TYPE_1", 
                       "QS2_17_TYPE_2",
                       "QS2_17_TYPE_3",
                       "QS2_18_LOCATION_1",
                       "QS2_18_LOCATION_2", 
                       "QS2_18_LOCATION_3",
                      "QS4_30_INTERNETCON")   # >=4 is unsure/pnts


cat_unsure_pnts_11 <- c("QS4_4_EDUCATIONALEXPEC")  # >=X11 is unsure/pnts

cat_X0_no <- c("QS4_27_INTERNETC1_1_1", "QS4_27_INTERNETC1_2_2", "QS4_27_INTERNETC1_3_3", "QS4_27_INTERNETC1_4_4", "QS4_32_MOBILECONNE1_1_1", "QS4_32_MOBILECONNE1_2_2", "QS4_32_MOBILECONNE1_3_3", "QS4_32_MOBILECONNE1_4_4")


# *** also need to remove QS4_30_INTERNETCON_X2 independently

Variables listed in cat_unsure_pnts_3_or_more contain Unsure and/or Prefer not to say at encoding Unsure = 3 and Prefer not to say = 4.

Variables listed in cat_unsure_pnts_4 contain Unsure and/or Prefer not to say at encoding Unsure = 4 and/or Prefer not to say = 5.

Variables listed in cat_unsure_pnts_11 contain Unsure and/or Prefer not to say at encoding Unsure = 11 and Prefer not to say = 12.

Internet access variables listed in cat_internet_access contain Unsure and/or Prefer not to say at encoding Unsure = 3 and Prefer not to say = 4.

# Removal of cat_unsure_pnts_3_or_more (train)

mean_mode_imputed_train <- mean_mode_imputed_train |>
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    "QS4_30_INTERNETCON_X2"
  )))

mean_mode_imputed_train <- mean_mode_imputed_train |> 
  select(-(ends_with(c("X_11")))) # cat_unsure_pnts_11


# Verify removal:
remaining_cols <- colnames(mean_mode_imputed_train)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(mean_mode_imputed_train))]

if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}

 [1] "QS2_17_TYPE_1_X2"           "QS2_17_TYPE_1_X3"          
 [3] "QS2_18_LOCATION_1_X2"       "QS2_18_LOCATION_1_X3"      
 [5] "QS2_17_TYPE_2_X2"           "QS2_17_TYPE_2_X3"          
 [7] "QS2_18_LOCATION_2_X2"       "QS2_18_LOCATION_2_X3"      
 [9] "QS2_17_TYPE_3_X2"           "QS2_17_TYPE_3_X3"          
[11] "QS2_18_LOCATION_3_X2"       "QS2_18_LOCATION_3_X3"      
[13] "QS2_23_MOSTMEANI_X2"        "QS2_23_MOSTMEANI_X3"       
[15] "QS4_1_MEANINGFULPERSON_X2"  "QS4_4_EDUCATIONALEXPEC_X2" 
[17] "QS4_4_EDUCATIONALEXPEC_X3"  "QS4_4_EDUCATIONALEXPEC_X4" 
[19] "QS4_4_EDUCATIONALEXPEC_X5"  "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2"        "QS4_22_PASTMENTO_X2"       
[23] "QS4_22_PASTMENTO_X3"        "QS4_29_PRIVATECONN_X2"     
[25] "QS4_30_INTERNETCON_X3"

# Removal of ohe_unimputed_train (train)

ohe_unimputed_train <- ohe_unimputed_train |> 
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    "QS4_30_INTERNETCON_X2"
  )))

ohe_unimputed_train <- ohe_unimputed_train |> 
  select(-(ends_with(c("X_11")))) # cat_unsure_pnts_11


# Verify removal:
remaining_cols <- colnames(ohe_unimputed_train)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(ohe_unimputed_train))]

if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}

 [1] "QS2_17_TYPE_1_X2"           "QS2_17_TYPE_1_X3"          
 [3] "QS2_18_LOCATION_1_X2"       "QS2_18_LOCATION_1_X3"      
 [5] "QS2_17_TYPE_2_X2"           "QS2_17_TYPE_2_X3"          
 [7] "QS2_18_LOCATION_2_X2"       "QS2_18_LOCATION_2_X3"      
 [9] "QS2_17_TYPE_3_X2"           "QS2_17_TYPE_3_X3"          
[11] "QS2_18_LOCATION_3_X2"       "QS2_18_LOCATION_3_X3"      
[13] "QS2_23_MOSTMEANI_X2"        "QS2_23_MOSTMEANI_X3"       
[15] "QS4_1_MEANINGFULPERSON_X2"  "QS4_4_EDUCATIONALEXPEC_X2" 
[17] "QS4_4_EDUCATIONALEXPEC_X3"  "QS4_4_EDUCATIONALEXPEC_X4" 
[19] "QS4_4_EDUCATIONALEXPEC_X5"  "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2"        "QS4_22_PASTMENTO_X2"       
[23] "QS4_22_PASTMENTO_X3"        "QS4_29_PRIVATECONN_X2"     
[25] "QS4_30_INTERNETCON_X3"

# Removal of cat_unsure_pnts_3_or_more (test)

mean_mode_imputed_test <- mean_mode_imputed_test |>
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    "QS4_30_INTERNETCON_X2"
  )))

mean_mode_imputed_test <- mean_mode_imputed_test |> 
  select(-(ends_with(c("X_11")))) # cat_unsure_pnts_11


# Verify removal:
remaining_cols <- colnames(mean_mode_imputed_test)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(mean_mode_imputed_test))]

if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}

 [1] "QS2_17_TYPE_1_X2"           "QS2_17_TYPE_1_X3"          
 [3] "QS2_18_LOCATION_1_X2"       "QS2_18_LOCATION_1_X3"      
 [5] "QS2_17_TYPE_2_X2"           "QS2_17_TYPE_2_X3"          
 [7] "QS2_18_LOCATION_2_X2"       "QS2_18_LOCATION_2_X3"      
 [9] "QS2_17_TYPE_3_X2"           "QS2_17_TYPE_3_X3"          
[11] "QS2_18_LOCATION_3_X2"       "QS2_18_LOCATION_3_X3"      
[13] "QS2_23_MOSTMEANI_X2"        "QS2_23_MOSTMEANI_X3"       
[15] "QS4_1_MEANINGFULPERSON_X2"  "QS4_4_EDUCATIONALEXPEC_X2" 
[17] "QS4_4_EDUCATIONALEXPEC_X3"  "QS4_4_EDUCATIONALEXPEC_X4" 
[19] "QS4_4_EDUCATIONALEXPEC_X5"  "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2"        "QS4_22_PASTMENTO_X2"       
[23] "QS4_22_PASTMENTO_X3"        "QS4_29_PRIVATECONN_X2"     
[25] "QS4_30_INTERNETCON_X3"

# Removal of ohe_unimputed_train (test)

ohe_unimputed_test <- ohe_unimputed_test |> 
  select(-matches(paste0(
    "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
    "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
    "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
    "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
    "QS4_30_INTERNETCON_X2"
  )))

ohe_unimputed_test <- ohe_unimputed_test |> 
  select(-(ends_with(c("X_11")))) # cat_unsure_pnts_11


# Verify removal:
remaining_cols <- colnames(ohe_unimputed_test)[grepl("_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$", colnames(ohe_unimputed_test))]

if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}

 [1] "QS2_17_TYPE_1_X2"           "QS2_17_TYPE_1_X3"          
 [3] "QS2_18_LOCATION_1_X2"       "QS2_18_LOCATION_1_X3"      
 [5] "QS2_17_TYPE_2_X2"           "QS2_17_TYPE_2_X3"          
 [7] "QS2_18_LOCATION_2_X2"       "QS2_18_LOCATION_2_X3"      
 [9] "QS2_17_TYPE_3_X2"           "QS2_17_TYPE_3_X3"          
[11] "QS2_18_LOCATION_3_X2"       "QS2_18_LOCATION_3_X3"      
[13] "QS2_23_MOSTMEANI_X2"        "QS2_23_MOSTMEANI_X3"       
[15] "QS4_1_MEANINGFULPERSON_X2"  "QS4_4_EDUCATIONALEXPEC_X2" 
[17] "QS4_4_EDUCATIONALEXPEC_X3"  "QS4_4_EDUCATIONALEXPEC_X4" 
[19] "QS4_4_EDUCATIONALEXPEC_X5"  "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2"        "QS4_22_PASTMENTO_X2"       
[23] "QS4_22_PASTMENTO_X3"        "QS4_29_PRIVATECONN_X2"     
[25] "QS4_30_INTERNETCON_X3"

7.1.1 Remove Possible Dependent Variables of Interest

Data leakage is the idea of inadvertently disclosing (or leaking) information about the dependent outcome variables (or the testing data set) to predictor variables in the training set. If we allow the outcome variables to exert antecedent influences on their predictors, and subsequently use the very same predictors to predict those outcome variables, we would end up in a circular reasoning fallacy, and this can lead to falsely inflated relationship between predictors and outcomes.
To avoid data leakage, we need to carefully remove all possible dependent variables we might use as outcomes, especially BEFORE we perform KNN imputation.
- KNN imputation assess the relationship between variables to approximate missing values. If dependent variables (DVs) are included, imputed values for the independent variables (IVs) will be influenced by the DVs, leading to biased relationship between the IV and the DV.
Decision: We will passthrough the possible DVs during the KNN imputation pipeline in python.

ohe_unimputed_train |>
select(starts_with(c('QS1_19_HIGHSCHOOL', 'QS4_19_CURRENTME', 'QS4_18_CURRENTOR', 
                            'QS1_20_HIGHSCHOOL', 'QS1_21_FURTHEDUCA', 'QS1_22_HIGHESTEDU', 
                            'QS1_23_YEARCOMPLE', 'QS1_25_EMPLOYMENT', 'QS1_26_EMPLOYMENT',
                            'QS1_27_PLANNINGRE',  'QS1_28_EMPLOYMENT', 'QS4_4_EDUCATIONALEXPEC', 
                            'QS4_5_SATEDU','QS4_6_DISAPPOINTED', # these are self-motivation variable.
                            'QS4_7_SOCIALCAPITAL', 'QS4_8_HELPSEEKING', 'QS4_9_MENTALHEALTH',
                            'QS4_10_MENTALWELLBE', 'QS4_11_BELONGING', 'QS4_12_TRUST',
                            'QS4_16_FORMALVOL', 'QS4_25_FUTUREMEN', 'QS4_21_MENTORING',
                            'QS4_17_SERVEDASM', 'QS4_18_CURRENTOR', 'QS4_22_PASTMENTO', 
                            'QS4_1_MEANINGFULPERSON', 'QS4_13_LIFEEVE'))) |>
summarize(across(everything(), ~sum(is.na(.)))) |>
glimpse()

8 Data frame export:

8.0.0.1 Export train/test data sets

As comma separated file:

# write_csv(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.csv")
# write_csv(ohe_unimputed_train, "../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.csv")
# write_csv(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.csv")
# 

# write_csv(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.csv")
# write_csv(mean_mode_impute_test, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_test.csv")

As R object (to retain the transformed data types)

# write_rds(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.Rds")
# write_rds(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.Rds")

# write_rds(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.Rds")

Alternative option: Try use Feature to preserve transformed data type:

write_feather(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.feather")
write_feather(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.feather")
write_feather(ohe_unimputed_train, "../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.feather") 

write_feather(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.feather")
write_feather(mean_mode_imputed_test, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_test.feather")
write_feather(ohe_unimputed_test, "../../dssg-2025-mentor-canada/Data/ohe_unimputed_test.feather")