<- read_csv("../../dssg-2025-mentor-canada/Data/encodedselectall.csv")
youth_orig
head(youth_orig)
5 OHE cleaning, linear reg, and XGBoost
(with dataset cleaning in the beginning)
6 Data Type Cleaning & Unification
6.1 Load cleaned data set with dummy-coded (one-hot encoded) variables:
Outcome variables (11 expected in total):
Q12. High school GED
|
Q44. Help-seeking
|
Q13. Further education/training -> counts at each level
Q13a. Higher education (controlling for age)
|
Q45. Mental health
|
Q14. Employment status
|
Q46. Mental well-being
|
Q41. Career planning
|
Q27. Belonging
|
Q43. Social capital
|
Q50. Volunteering
|
<- c("QS1_19_HIGHSCHOOL", "QS1_20_HIGHSCHOOL")
high_school_ged
<- c("QS1_21_FURTHEDUCA", "QS1_22_HIGHESTEDU", "QS1_23_YEARCOMPLE")
further_educ
<- "QS1_25_EMPLOYMENT"
employment_status <- "QS1_28_EMPLOYMENT_calculated"
yearly_income <- c("QS4_3_CAREERPLANNIN1_1_1", "QS4_3_CAREERPLANNIN1_2_2", "QS4_3_CAREERPLANNIN1_3_3", "QS4_3_CAREERPLANNIN1_4_4", "QS4_3_CAREERPLANNIN1_5_5", "QS4_3_CAREERPLANNIN1_6_6", "QS4_3_CAREERPLANNIN1_7_7", "QS4_3_CAREERPLANNIN1_8_8")
career_planning
<- c("QS4_7_SOCIALCAPITAL1_1_1", "QS4_7_SOCIALCAPITAL1_2_2", "QS4_7_SOCIALCAPITAL1_3_3", "QS4_7_SOCIALCAPITAL1_4_4")
social_capital
<- c("QS4_8_HELPSEEKING1_1_1", "QS4_8_HELPSEEKING1_2_2", "QS4_8_HELPSEEKING1_3_3", "QS4_8_HELPSEEKING1_4_4", "QS4_8_HELPSEEKING1_5_5", "QS4_8_HELPSEEKING1_6_6", "QS4_8_HELPSEEKING1_7_7", "QS4_8_HELPSEEKING1_8_8", "QS4_8_HELPSEEKING1_9_9", "QS4_8_HELPSEEKING1_10_10")
help_seeking <- "QS4_9_MENTALHEALTH"
mental_health <- c("QS4_10_MENTALWELLBE1_1_1", "QS4_10_MENTALWELLBE1_2_2", "QS4_10_MENTALWELLBE1_3_3", "QS4_10_MENTALWELLBE1_4_4", "QS4_10_MENTALWELLBE1_5_5", "QS4_10_MENTALWELLBE1_6_6", "QS4_10_MENTALWELLBE1_7_7")
well_being
<- "QS4_11_BELONGING"
belonging
<- c("QS4_14_FORMALVOL", "QS4_15_TIMEIFFOR1", "QS4_15_TIMEIFFOR2", "QS4_15_TIMEIFFOR3", "QS4_15_TIMEIFFOR4", "QS4_16_FORMALVOL")
volunteering
<- youth_orig |>
youth_iv select(-all_of(c(high_school_ged,
further_educ,
employment_status,
yearly_income,
career_planning,
social_capital,
help_seeking,
mental_health,
well_being,
belonging,
volunteering)))
<- youth_orig |>
youth_dv select(all_of(c(high_school_ged,
further_educ,
employment_status,
yearly_income,
career_planning,
social_capital,
help_seeking,
mental_health,
well_being,
belonging, volunteering)))
6.1.0.1 Ensure all postal code are in proper upper cases:
$geo_postcode_fsa <- toupper(youth_orig$geo_postcode_fsa) youth_orig
6.1.1 Remove pre-processed column (binarized) by previous researchers
<- youth_orig |>
youth select(-c(QS1_8_NEWCOMERYEAR_cat:Week_income))
<- youth |>
youth select(-c(QS2_16_FORMAT_any:total_yearly_income))
6.1.2 Count proportion of empty entries per variable:
# Total number of respondents (rows); used as the denominator for the
# NA percentages below.
n <- nrow(youth)

# One row per variable: number of NA entries, percentage of NA entries, and
# number of non-NA entries. `summarise(across(...))` replaces the superseded
# `summarize_all()`.
na_count_table <- youth |>
  summarise(across(everything(), ~ sum(is.na(.x)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "columns",
    values_to = "na_count"
  ) |>
  mutate(
    na_percent = na_count / n * 100,
    non_na_entries = n - na_count
  )

# Variables for which at least 70% of the entries are NA.
na_count_70_percent <- na_count_table |>
  filter(na_percent >= 70)
head(na_count_70_percent)
# A tibble: 6 × 4
columns na_count na_percent non_na_entries
<chr> <int> <dbl> <int>
1 geo_postcode_fsa 2250 79.3 588
2 QS1_5_INDIGENOUSHS 2544 89.6 294
3 QS1_8_NEWCOMERYEAR 2421 85.3 417
4 QS1_17_INCARE 2752 97.0 86
5 QS1_26_EMPLOYMENT 2437 85.9 401
6 QS1_27_PLANNINGRE 2437 85.9 401
176 variables have >= 70% empty NA entries.
6.1.2.1 Identify highly sparse column (number of entries < 30):
<- na_count_table |>
cols_less_than_30_entires filter(non_na_entries < 30)
print(cols_less_than_30_entires, n = nrow(cols_less_than_30_entires))
$columns
cols_less_than_30_entires
<- na_count_table |>
cols_less_than_90_percent filter(na_percent >= 90)
print(cols_less_than_90_percent, n = nrow(cols_less_than_90_percent))
$columns cols_less_than_90_percent
6.1.3 Remove Irrelevant Columns:
<- youth |>
youth
# Remove text columns:
select(-c("QS2_25_YOUTHINIT1", "QS2_29_MATCHCRITERIA_O", "QS2_39_2_Other",
"QS4_23_PASTFORMA1", "QS2_39_1_Other", "QS2_39_3_Other")) |>
# Remove columns with <30 valid entries:
select(-any_of(cols_less_than_30_entires$columns)) |>
# Remove repeated income columns:
select(-c("QS1_29_EMPLOYMENT", "QS1_30_EMPLOYMENT1", "QS1_30_EMPLOYMENT2",
"QS1_31_EMPLOYMENT", "QS1_32_WEEKLY")) |>
# These columns were selected out in other files, so any_of() is used:
select(-any_of(c("QS2_15_RELATIONSHIP1", "QS2_5_Other", "QS2_17_TYPE_1_Other"))) |>
# Take out postal code--it will be useful only when building geospatial plot:
select(-geo_postcode_fsa) |>
# Text column: Name of organization with with respondent served as a mentor
select(-c("QS4_19_CURRENTME1", "QS4_19_CURRENTME2", "QS4_19_CURRENTME3")) |>
# Text column: name of mentor organization, else unsure (QS4_23_PASTFORMA2) or prefer not say (QS4_23_PASTFORMA3):
select(-c('QS4_23_PASTFORMA2', 'QS4_23_PASTFORMA3')) |>
# This is an 'other' column for text written response:
select(-any_of(c("QS2_34_SUPPORTS1_10_10"))) |>
# text entry:
select(-any_of(c("QS2_26_INITIATIONEV"))) |>
# whether respondent remembered name of organization:
select(-any_of(c("QS2_27_MENTORPROGRA1")))
6.1.3.1 Remove Prefer not to answer
/ Don't know
responses in the following ordinal items:
QS4_9_MENTALHEALTH
"Don’t know"
"Prefer not to answer"
QS4_11_BELONGING
"Don’t know"
"Prefer not to answer"
QS4_20_MENTEEAGE
"Unsure"
"Prefer not to say"
nrow(youth) # 2838
<- youth |>
youth filter(!QS4_9_MENTALHEALTH %in% c("Don’t know", "Prefer not to answer"), # ordinal
!QS4_11_BELONGING %in% c("Don’t know", "Prefer not to answer"), # ordinal
!QS4_20_MENTEEAGE %in% c("Unsure", "Prefer not to say")) # ordinal
nrow(youth) # 2396
# Per-column count of cells equal to "Unsure"; only columns with at least one
# such response are kept. `tibble::enframe()` replaces `tidy()` on a named
# numeric vector (deprecated in broom) and yields the same `names` / `x`
# columns.
unsure_counts <- colSums(youth == "Unsure", na.rm = TRUE) |>
  tibble::enframe(name = "names", value = "x") |>
  mutate(is_unsure = x >= 1) |>
  filter(is_unsure)

# Per-column count of cells equal to "Prefer not to say". The flag column keeps
# the name `is_unsure` so the output has the same shape as `unsure_counts`;
# here it marks columns containing at least one "Prefer not to say" response.
pns_counts <- colSums(youth == "Prefer not to say", na.rm = TRUE) |>
  tibble::enframe(name = "names", value = "x") |>
  mutate(is_unsure = x >= 1) |>
  filter(is_unsure)
head(unsure_counts)
# A tibble: 6 × 3
names x is_unsure
<chr> <dbl> <lgl>
1 QS1_2_PROV 6 TRUE
2 QS1_4_INDIGENOUS 101 TRUE
3 QS1_12_DISABIL 133 TRUE
4 QS1_13_DISABIL 43 TRUE
5 QS1_22_HIGHESTEDU 110 TRUE
6 QS2_1_MEANINGFULP 254 TRUE
head(pns_counts)
# A tibble: 6 × 3
names x is_unsure
<chr> <dbl> <lgl>
1 QS1_2_PROV 14 TRUE
2 QS1_4_INDIGENOUS 75 TRUE
3 QS1_11_SEXUALO 91 TRUE
4 QS1_12_DISABIL 40 TRUE
5 QS1_13_DISABIL 11 TRUE
6 QS2_1_MEANINGFULP 29 TRUE
- Career Planning Variables contain a huge number of ‘Unsure’ responses:
It would be more appropriate to return to this variable later and remove these observations.
To work with the rest of the data, it may serve the analysis better if
QS4_3_CAREERPLANNIN1_1_1
toQS4_3_CAREERPLANNIN1_8_8
will be removed.
QS4_3_CAREERPLANNIN1_1_1: 486
Unsure
responsesQS4_3_CAREERPLANNIN1_2_2: 413
QS4_3_CAREERPLANNIN1_3_3: 508
QS4_3_CAREERPLANNIN1_4_4: 463
QS4_3_CAREERPLANNIN1_5_5: 591
QS4_3_CAREERPLANNIN1_6_6: 354
QS4_3_CAREERPLANNIN1_7_7: 512
QS4_3_CAREERPLANNIN1_8_8: 717
- Other variables containing
Unsure
responses:
- Given that the number of `Unsure` responses in these variables is small relative to the total number of observations (~2,900), their corresponding observations will be removed. The resulting dimensions of the training data will then be examined.
6.2 Identify continuous metrics columns:
QS1_14_DISABIL (9b: At what age did disability / conditions emerge.)
QS1_15_DISABIL (9b1: How many years have the disability affected performance)
Education levels (years) being treated as continuous due to > 7 levels.
QS1_18_PARENTEDUC1
QS1_18_PARENTEDUC2
QS1_20_HIGHSCHOOL
QS1_23_YEARCOMPLE (In what year completed higher education)
Employment wage / salary:
QS1_29_EMPLOYMENT (removed)
QS1_30_EMPLOYMENT1 (removed)
QS1_30_EMPLOYMENT2 (removed)
QS1_31_EMPLOYMENT (removed)
QS1_32_WEEKLY (removed)
Month_income
Semimonth_income
Biweek_income
Week_income
QS1_28_EMPLOYMENT_calculated
Volunteer hours:
QS4_15_TIMEIFFOR1 (Total hour)
QS4_15_TIMEIFFOR2 (Hours per month)
QS4_15_TIMEIFFOR3 (Hours per week)
QS4_15_TIMEIFFOR4 (Hours per day)
Number of mentors:
- QS2_10_NUMBEROFME
# metric variables, not including outcome `QS1_25_EMPLOYMENT`
<- c("QS1_1_AGE", "QS1_8_NEWCOMERYEAR", "QS1_14_DISABIL", "QS1_15_DISABIL", "QS1_20_HIGHSCHOOL", "QS1_23_YEARCOMPLE", "QS2_10_NUMBEROFME", "QS4_15_TIMEIFFOR1", "QS4_15_TIMEIFFOR2", "QS4_15_TIMEIFFOR3", "QS4_15_TIMEIFFOR4", "QS4_12_TRUST1_1_1", "QS4_12_TRUST1_2_2", "QS4_12_TRUST1_3_3", "QS4_12_TRUST1_4_4", "QS4_12_TRUST1_5_5", "QS4_14_FORMALVOL") metric_vars
6.3 Convert all categorical columns into factor data type
- Identify categorical columns:
<- c("QS1_2_PROV", "QS1_3_COMMUNITYTYPE", "QS1_4_INDIGENOUS",
cate_vars "QS1_5_INDIGENOUSHS",
"QS1_7_NEWCOMER", "QS1_10_TRANSUM", "QS1_11_SEXUALO", "QS1_12_DISABIL", "QS1_13_DISABIL",
"QS1_17_INCARE",
"QS1_19_HIGHSCHOOL", "QS1_21_FURTHEDUCA", "QS1_25_EMPLOYMENT",
"QS1_27_PLANNINGRE",
"QS1_28_EMPLOYMENT", "QS2_1_MEANINGFULP", "QS2_2_MEANINGFULP", "QS2_3_PRESENCEOFM", "QS2_4_MENTOR61FOR",
"QS2_5_MENTOR611PR",
"QS2_7_MENTOR611SE", "QS2_8_UNMETNEED61", "QS2_9_PRESENCEOFA", "QS2_11_MENTOR1218", "QS2_12_UNMETNEED1", "QS2_16_FORMAT_1", "QS2_17_TYPE_1", "QS2_18_LOCATION_1",
"QS2_22_GEOLOCATI1",
"QS2_16_FORMAT_2", "QS2_17_TYPE_2", "QS2_18_LOCATION_2",
"QS2_22_GEOLOCATI2",
"QS2_16_FORMAT_3", "QS2_17_TYPE_3", "QS2_18_LOCATION_3",
"QS2_22_GEOLOCATI3",
"QS2_23_MOSTMEANI", "QS2_24_MENTORAGE",
# "QS2_26_INITIATIONEV", # *Removed due to having 99.7% NA entries.
# "QS2_27_MENTORPROGRA1", # Removed because it asks for 'recall of the name of mentor organization'
"QS2_28_MATCHCHOICE", "Match_GenderIdentity", "Match_Ethnicity", "Match_CulturalBackground", "Match_ScheduleAvailability", "Match_Interests", "Match_Goals", "Match_Personalities", "Match_LifeStruggles", "Transition_School", "Transition_NewSchool", "Transition_NewCommunity", "Transition_GettingDriversLicense", "Transition_JobAspirations", "Transition_GettingFirstJob", "Transition_ApplyingToTradeSchool-Collge-Uni", "Transition_IndependenceFromGuardian", "Transition_FundingForTradeSchool-Collge-Uni", "Transition_NoneOfAbove", "QS2_33_TRANSITIONS1_13_13", "QS2_33_TRANSITIONS1_14_14", "QS3_4_LIFEEVENTS1_1_1", "QS3_4_LIFEEVENTS1_2_2", "QS3_4_LIFEEVENTS1_3_3", "QS3_4_LIFEEVENTS1_4_4", "QS3_4_LIFEEVENTS1_5_5", "QS3_4_LIFEEVENTS1_6_6", "QS3_4_LIFEEVENTS1_7_7", "QS3_4_LIFEEVENTS1_8_8", "QS3_4_LIFEEVENTS1_9_9", "QS3_4_LIFEEVENTS1_10_10", "QS3_4_LIFEEVENTS1_11_11", "QS3_4_LIFEEVENTS1_12_12", "QS3_4_LIFEEVENTS1_13_13", "QS3_4_LIFEEVENTS1_14_14", "QS3_4_LIFEEVENTS1_15_15", "QS3_4_LIFEEVENTS1_16_16", "QS3_4_LIFEEVENTS1_17_17", "QS3_4_LIFEEVENTS1_18_18", "QS3_4_LIFEEVENTS1_19_19", "QS3_4_LIFEEVENTS1_20_20", "QS4_1_MEANINGFULPERSON", "QS4_4_EDUCATIONALEXPEC", "QS4_6_DISAPPOINTED", "QS4_13_LIFEEVE1_1_1", "QS4_13_LIFEEVE1_2_2", "QS4_13_LIFEEVE1_3_3", "QS4_13_LIFEEVE1_4_4", "QS4_13_LIFEEVE1_5_5", "QS4_13_LIFEEVE1_6_6", "QS4_16_FORMALVOL", "QS4_17_SERVEDASM", "QS4_18_CURRENTOR",
"QS4_21_MENTORING", "QS4_22_PASTMENTO", "QS4_25_FUTUREMEN",
"QS4_26_INTERNETC", "QS4_27_INTERNETC1_1_1", "QS4_27_INTERNETC1_2_2", "QS4_27_INTERNETC1_3_3", "QS4_27_INTERNETC1_4_4", "QS4_28_INTERNETCON", "QS4_29_PRIVATECONN", "QS4_30_INTERNETCON", "QS4_31_MOBILECONNE", "QS4_32_MOBILECONNE1_1_1", "QS4_32_MOBILECONNE1_2_2", "QS4_32_MOBILECONNE1_3_3", "QS4_32_MOBILECONNE1_4_4", "QS4_33_MOBILECONNECT"
)
<- youth |>
youth mutate(across(all_of(cate_vars), as_factor))
6.4 Identify Likert scale items columns (ordinal)
<- c("QS1_18_PARENTEDUC1", "QS1_18_PARENTEDUC2", "QS2_19_DURATION_1", "QS2_20_EXPERIENCE_1", "QS2_19_DURATION_2", "QS2_20_EXPERIENCE_2", "QS2_19_DURATION_3", "QS2_20_EXPERIENCE_3", "QS2_30_MATCHSIMILAR1_1_1", "QS2_30_MATCHSIMILAR1_2_2", "QS2_30_MATCHSIMILAR1_3_3", "QS2_30_MATCHSIMILAR1_4_4", "QS2_30_MATCHSIMILAR1_5_5", "QS2_31_MENTORINGREL1_1_1", "QS2_31_MENTORINGREL1_2_2", "QS2_31_MENTORINGREL1_3_3", "QS2_31_MENTORINGREL1_4_4", "QS2_31_MENTORINGREL1_5_5", "QS2_32_MENTORINGENG1_1_1", "QS2_32_MENTORINGENG1_2_2", "QS2_32_MENTORINGENG1_3_3", "QS2_32_MENTORINGENG1_4_4", "QS2_32_MENTORINGENG1_5_5", "QS2_32_MENTORINGENG1_6_6", "QS2_32_MENTORINGENG1_7_7", "QS2_32_MENTORINGENG1_8_8", "QS2_32_MENTORINGENG1_9_9", "QS2_32_MENTORINGENG1_10_10", "QS2_32_MENTORINGENG1_11_11", "QS2_32_MENTORINGENG1_12_12", "QS2_32_MENTORINGENG1_13_13", "QS2_32_MENTORINGENG1_14_14", "QS2_32_MENTORINGENG1_15_15", "QS2_32_MENTORINGENG1_16_16", "QS2_32_MENTORINGENG1_17_17", "QS2_32_MENTORINGENG1_18_18", "QS2_32_MENTORINGENG1_19_19", "QS2_32_MENTORINGENG1_20_20", "QS2_32_MENTORINGENG1_21_21", "QS2_32_MENTORINGENG1_22_22", "QS2_35_SUPPORTSIMPO1_1_1", "QS2_35_SUPPORTSIMPO1_2_2", "QS2_35_SUPPORTSIMPO1_3_3", "QS2_35_SUPPORTSIMPO1_4_4", "QS2_35_SUPPORTSIMPO1_5_5", "QS2_35_SUPPORTSIMPO1_6_6", "QS2_35_SUPPORTSIMPO1_7_7", "QS2_35_SUPPORTSIMPO1_8_8", "QS2_35_SUPPORTSIMPO1_9_9", "QS2_35_SUPPORTSIMPO1_10_10", "QS2_37_HELPFULNESS",
ordinal_numeric_vars # "QS_40_REMATCHING_2", "QS_40_REMATCHING_3",
"QS3_1_GLOBALSELFWOR1_1_1", "QS3_1_GLOBALSELFWOR1_2_2", "QS3_1_GLOBALSELFWOR1_3_3", "QS3_1_GLOBALSELFWOR1_4_4", "QS3_1_GLOBALSELFWOR1_5_5", "QS3_1_GLOBALSELFWOR1_6_6", "QS3_1_GLOBALSELFWOR1_7_7", "QS3_1_GLOBALSELFWOR1_8_8", "QS3_5_SCHOOLCLIMATE1_1_1", "QS3_5_SCHOOLCLIMATE1_2_2", "QS3_5_SCHOOLCLIMATE1_3_3", "QS3_5_SCHOOLCLIMATE1_4_4", "QS3_5_SCHOOLCLIMATE1_5_5", "QS3_5_SCHOOLCLIMATE1_6_6", "QS3_5_SCHOOLCLIMATE1_7_7", "QS3_5_SCHOOLCLIMATE1_8_8", "QS3_5_SCHOOLCLIMATE1_9_9", "QS3_5_SCHOOLCLIMATE1_10_10", "QS4_2_MEANINGFULPERSON", "QS4_8_HELPSEEKING1_1_1", "QS4_8_HELPSEEKING1_2_2", "QS4_8_HELPSEEKING1_3_3", "QS4_8_HELPSEEKING1_4_4", "QS4_8_HELPSEEKING1_5_5", "QS4_8_HELPSEEKING1_6_6", "QS4_8_HELPSEEKING1_7_7", "QS4_8_HELPSEEKING1_8_8", "QS4_8_HELPSEEKING1_9_9", "QS4_8_HELPSEEKING1_10_10", "QS4_10_MENTALWELLBE1_1_1", "QS4_10_MENTALWELLBE1_2_2", "QS4_10_MENTALWELLBE1_3_3", "QS4_10_MENTALWELLBE1_4_4", "QS4_10_MENTALWELLBE1_5_5", "QS4_10_MENTALWELLBE1_6_6", "QS4_10_MENTALWELLBE1_7_7"
)
<- c("QS1_22_HIGHESTEDU", "QS2_6_MENTOREXPER", "QS2_34_SUPPORTS1_1_1", "QS2_34_SUPPORTS1_2_2", "QS2_34_SUPPORTS1_3_3", "QS2_34_SUPPORTS1_4_4", "QS2_34_SUPPORTS1_5_5", "QS2_34_SUPPORTS1_6_6", "QS2_34_SUPPORTS1_7_7", "QS2_34_SUPPORTS1_8_8", "QS2_34_SUPPORTS1_9_9", "QS2_36_INFLUENCE1_1_1", "QS2_36_INFLUENCE1_2_2", "QS2_36_INFLUENCE1_3_3", "QS2_36_INFLUENCE1_4_4", "QS2_36_INFLUENCE1_5_5", "QS2_36_INFLUENCE1_6_6", "QS2_36_INFLUENCE1_7_7", "QS2_36_INFLUENCE1_8_8", "QS2_36_INFLUENCE1_9_9", "QS4_3_CAREERPLANNIN1_1_1", "QS4_3_CAREERPLANNIN1_2_2", "QS4_3_CAREERPLANNIN1_3_3", "QS4_3_CAREERPLANNIN1_4_4", "QS4_3_CAREERPLANNIN1_5_5", "QS4_3_CAREERPLANNIN1_6_6", "QS4_3_CAREERPLANNIN1_7_7", "QS4_3_CAREERPLANNIN1_8_8", "QS4_5_SATEDU", "QS4_7_SOCIALCAPITAL1_1_1", "QS4_7_SOCIALCAPITAL1_2_2", "QS4_7_SOCIALCAPITAL1_3_3", "QS4_7_SOCIALCAPITAL1_4_4","QS4_9_MENTALHEALTH", "QS4_11_BELONGING","QS4_20_MENTEEAGE", "QS4_24_FUTUREMEN") ordinal_chr_vars
6.4.1 Handling Varied Likert Scale Items: Label Alignment Issue
6.4.1.1 Task: Reverse coding for raw ordinal columns with numeric encoding:
For some ordinal columns (the numerically encoded ones), the valence direction is inconsistent. The goal is to ensure that higher numeric values consistently represent the ‘positive’ or ‘high’ end of the scale (e.g., agreement, positivity, truth) across all variables:
Question 11:
QS1_18_PARENTEDUC*
Question 20f:
QS2_19_DURATION_*
- <3 months, 3-6 months, 7-11 months, 12 months or more, It’s still ongoing
Question 20g:
QS2_20_EXPERIENCE_*
- Always positive; Mostly positive; Neutral; Mostly negative; Always negative
Question 26:
QS2_30_MATCHSIMILAR1_*_*
- (1) Disagree (2) Neither agree nor disagree (3) Agree
Question 27:
QS2_31_MENTORINGREL1_*_*
- 1. Strongly disagree 2. Disagree 3. Neither agree nor disagree 4. Agree 5. Strongly agree
Question 28:
QS2_32_MENTORINGENG1_*_*
- 1.Not very true—2.Sometimes true—3.Very true
Question 30:
QS2_35_SUPPORTSIMPO1_6_6
- Not very true—Sometimes true—Very true
Question 36:
QS3_1_GLOBALSELFWOR1_2_2
- (1) Strongly disagree (2) Disagree (3) Agree (4) Strongly agree
Question 39:
QS3_5_SCHOOLCLIMATE1_*_*
- Strongly Agree——- A——-D——–Strongly Disagree
Question44:
QS4_8_HELPSEEKING1_*_*
- (1) Not at all true (2) (3) (4) (5) Completely true
Question 46
QS4_10_MENTALWELLBE1_*_*
:- (1) None of the time — (2) Rarely — (3) Some of the time — (4) Often — (5) All of the time
Steps: `QS2_20_EXPERIENCE_*` (1-5 scale: Always positive to Always negative) and `QS3_5_SCHOOLCLIMATE1_*` (1-4 scale: Strongly Agree to Strongly Disagree) require reverse coding.
# Trial run of the reverse coding: the reversed values are written to new
# `reversed_*` columns so they can be compared side by side with the originals.
#
# The experience items are named `QS2_20_EXPERIENCE_*` in this data set (see
# `ordinal_numeric_vars`); the previous `^QS2_19_EXPERIENCE_` pattern matched
# no column, so those items were silently left unreversed.
reversed_coding <- youth |>
  mutate(
    # Reverse QS2_20_EXPERIENCE_* (1-5 scale: Always positive to Always negative).
    # NOTE(review): assumes only codes 1-5 occur in these columns -- confirm.
    across(
      matches("^QS2_20_EXPERIENCE_"),
      ~ 6 - .x,
      .names = "reversed_{.col}"
    ),
    # Reverse QS3_5_SCHOOLCLIMATE1_*_* (1-4 scale: Strongly Agree to Strongly Disagree)
    across(
      matches("^QS3_5_SCHOOLCLIMATE1_"),
      ~ 5 - .x,
      .names = "reversed_{.col}"
    )
  )

# Validate reversed coding: original and reversed columns side by side.
reversed_coding |>
  select(
    starts_with("QS2_20_EXPERIENCE_"),
    starts_with("reversed_QS2_20_EXPERIENCE_"),
    starts_with("QS3_5_SCHOOLCLIMATE1_"),
    starts_with("reversed_QS3_5_SCHOOLCLIMATE1_")
  ) |>
  head()
# A tibble: 6 × 20
QS3_5_SCHOOLCLIMATE1_1_1 QS3_5_SCHOOLCLIMATE1_2_2 QS3_5_SCHOOLCLIMATE1_3_3
<dbl> <dbl> <dbl>
1 4 4 4
2 1 2 2
3 1 1 1
4 3 2 2
5 3 3 3
6 2 2 3
# ℹ 17 more variables: QS3_5_SCHOOLCLIMATE1_4_4 <dbl>,
# QS3_5_SCHOOLCLIMATE1_5_5 <dbl>, QS3_5_SCHOOLCLIMATE1_6_6 <dbl>,
# QS3_5_SCHOOLCLIMATE1_7_7 <dbl>, QS3_5_SCHOOLCLIMATE1_8_8 <dbl>,
# QS3_5_SCHOOLCLIMATE1_9_9 <dbl>, QS3_5_SCHOOLCLIMATE1_10_10 <dbl>,
# reversed_QS3_5_SCHOOLCLIMATE1_1_1 <dbl>,
# reversed_QS3_5_SCHOOLCLIMATE1_2_2 <dbl>,
# reversed_QS3_5_SCHOOLCLIMATE1_3_3 <dbl>, …
reversed_coding
shows thatreversed_QS2_19_EXPERIENCE_
andreversed_QS3_5_SCHOOLCLIMATE1_
are successful reverse encoding ofQS2_19_EXPERIENCE_
andQS3_5_SCHOOLCLIMATE1_
!Now, let’s apply the this reverse encoding onto the original
youth
data set, replacing theQS2_19_EXPERIENCE_
andQS3_5_SCHOOLCLIMATE1_
directly in the original data frame:
# Apply the reverse coding in place, overwriting the original columns.
# This step is not idempotent: running it twice restores the original coding.
#
# The experience items are named `QS2_20_EXPERIENCE_*` in this data set (see
# `ordinal_numeric_vars`); the previous `^QS2_19_EXPERIENCE_` pattern matched
# no column, so those items were never reversed.
youth <- youth |>
  mutate(
    # Reverse QS2_20_EXPERIENCE_* (1-5 scale: Always positive to Always negative).
    # NOTE(review): assumes only codes 1-5 occur in these columns -- confirm.
    across(matches("^QS2_20_EXPERIENCE_"), ~ 6 - .x),
    # Reverse QS3_5_SCHOOLCLIMATE1_*_* (1-4 scale: Strongly Agree to Strongly Disagree)
    across(matches("^QS3_5_SCHOOLCLIMATE1_"), ~ 5 - .x)
  )

# Once again, validate result:
youth |>
  select(
    starts_with("QS2_20_EXPERIENCE_"),
    starts_with("QS3_5_SCHOOLCLIMATE1_")
  ) |>
  head()
# A tibble: 6 × 10
QS3_5_SCHOOLCLIMATE1_1_1 QS3_5_SCHOOLCLIMATE1_2_2 QS3_5_SCHOOLCLIMATE1_3_3
<dbl> <dbl> <dbl>
1 1 1 1
2 4 3 3
3 4 4 4
4 2 3 3
5 2 2 2
6 3 3 2
# ℹ 7 more variables: QS3_5_SCHOOLCLIMATE1_4_4 <dbl>,
# QS3_5_SCHOOLCLIMATE1_5_5 <dbl>, QS3_5_SCHOOLCLIMATE1_6_6 <dbl>,
# QS3_5_SCHOOLCLIMATE1_7_7 <dbl>, QS3_5_SCHOOLCLIMATE1_8_8 <dbl>,
# QS3_5_SCHOOLCLIMATE1_9_9 <dbl>, QS3_5_SCHOOLCLIMATE1_10_10 <dbl>
6.4.2 Convert all string/character Likert scale items columns into ordinal data type:
<- youth |>
youth mutate(across(all_of(ordinal_numeric_vars), as_factor)) # converts ordinal items encoded in either (1) numbers or (2) strings into factor.
<- youth |>
youth mutate(across(all_of(ordinal_chr_vars), as_factor))
6.4.3 Convert string/character Likert scale level columns into ordered numeric:
- Specify string/character-labeled Likert levels:
<- list(levels = 1:5, labels = c("Always negative", "Mostly negative", "Somewhat positive", "Mostly positive", "Always positive"))
experience_ord <- list(levels = 1:3, labels = c("Not very true","Sometimes true","Very true"))
support_ord <- list(levels = 1:4, labels = c("A little","Some","Quite a bit","A lot")) # No 'None' in response range
influence_ord <- list(levels = 1:7, labels = c("Completely disagree","Disagree","Somewhat disagree","Unsure","Somewhat agree","Agree","Completely agree"))
career_plan_ord <- list(levels = 1:5, labels = c("Strongly disagree","Disagree","Neutral","Agree","Strongly Agree"))
social_capital_ord <- list(levels = 1:5, labels = c("Poor","Fair","Good","Very good", "Excellent"))
mental_health_ord # "Don't know" / 'Prefer not to answer' is not included (previously filtered)
<- list(levels = 1:4, labels = c("Very weak","Somewhat weak","Somewhat strong","Very strong"))
belonging_ord <- list(levels = 1:4, labels = c("Under 18 years old","18-25 years old","26-29 years old","30 years old or older"))
mentee_age_ord <- list(levels = 1:4, labels = c("Not interested at all", "Not that interested","Fairly interested","Very interested"))
mentoring_interest_ord
<- list(levels = 1:11, labels = c("Certificate of Apprenticeship or Certificate of Qualification",
high_edu_ord "Other trades certificate or diploma",
"College, CEGEP or other non-university certificate or diploma",
"University diploma or certificate BELOW Bachelor’s Degree",
"Bachelor’s degree (e.g. BA, BSc, B.Ed., B.Eng including LL.B-law degree)",
"University diploma or certificate ABOVE Bachelor’s Degree",
"First professional degree (e.g. medicine, dentistry, veterinary medicine, optometry)",
"Master's degree (e.g. M.B.A., M.A., M.Sc.)",
"Doctorate degree (earned e.g. Ph.D, D.Sc., D.Ed) or post-doctoral program",
"Other, please specify",
"Unsure"))
- Compile all character-labeled Likert items in a list (i.e., a list of list in this case).
<- list(QS2_6_MENTOREXPER = experience_ord,
lst_of_ord_cols
QS2_34_SUPPORTS1_1_1 = support_ord,
QS2_34_SUPPORTS1_2_2 = support_ord,
QS2_34_SUPPORTS1_3_3 = support_ord,
QS2_34_SUPPORTS1_4_4 = support_ord,
QS2_34_SUPPORTS1_5_5 = support_ord,
QS2_34_SUPPORTS1_6_6 = support_ord,
QS2_34_SUPPORTS1_7_7 = support_ord,
QS2_34_SUPPORTS1_8_8 = support_ord,
QS2_34_SUPPORTS1_9_9 = support_ord,
QS2_36_INFLUENCE1_1_1 = influence_ord,
QS2_36_INFLUENCE1_2_2 = influence_ord,
QS2_36_INFLUENCE1_3_3 = influence_ord,
QS2_36_INFLUENCE1_4_4 = influence_ord,
QS2_36_INFLUENCE1_5_5 = influence_ord,
QS2_36_INFLUENCE1_6_6 = influence_ord,
QS2_36_INFLUENCE1_7_7 = influence_ord,
QS2_36_INFLUENCE1_8_8 = influence_ord,
QS2_36_INFLUENCE1_9_9 = influence_ord,
QS4_3_CAREERPLANNIN1_1_1 = career_plan_ord,
QS4_3_CAREERPLANNIN1_2_2 = career_plan_ord,
QS4_3_CAREERPLANNIN1_3_3 = career_plan_ord,
QS4_3_CAREERPLANNIN1_4_4 = career_plan_ord,
QS4_3_CAREERPLANNIN1_5_5 = career_plan_ord,
QS4_3_CAREERPLANNIN1_6_6 = career_plan_ord,
QS4_3_CAREERPLANNIN1_7_7 = career_plan_ord,
QS4_3_CAREERPLANNIN1_8_8 = career_plan_ord,
QS4_7_SOCIALCAPITAL1_1_1 = social_capital_ord,
QS4_7_SOCIALCAPITAL1_2_2 = social_capital_ord,
QS4_7_SOCIALCAPITAL1_3_3 = social_capital_ord,
QS4_7_SOCIALCAPITAL1_4_4 = social_capital_ord,
QS4_9_MENTALHEALTH = mental_health_ord,
QS4_11_BELONGING = belonging_ord,
QS4_20_MENTEEAGE = mentee_age_ord,
QS4_24_FUTUREMEN = mentoring_interest_ord,
QS1_22_HIGHESTEDU = high_edu_ord,
QS4_5_SATEDU = high_edu_ord)
- Write a function to convert all character labels to the correct ordinal numeric scores!
6.4.3.0.1 Function 1: For converting string type ordinal leveled columns into well-ordered integers:
#' Convert string-labelled ordinal columns into ordered factors with numeric labels
#'
#' For each column named in `lst_col_names`, the string responses are matched
#' against the `labels` of that column's scale specification and replaced by the
#' corresponding `levels` (e.g. "Poor" -> 1, ..., "Excellent" -> 5). The result
#' is an ordered factor whose level labels are the numeric scores.
#'
#' @param data A data frame (or tibble) containing the columns to convert.
#' @param lst_ord_level_cols A named list keyed by column name; each element is
#'   a list with `levels` (the numeric scores) and `labels` (the response
#'   strings, in ascending order).
#' @param lst_col_names Character vector of column names to convert.
#' @return `data` with the requested columns replaced by ordered factors.
#'   Responses that are not listed in a column's `labels` become `NA`, and a
#'   warning names them.
convert_string_ord_fac <- function(data, lst_ord_level_cols, lst_col_names) {
  missing_cols <- setdiff(lst_col_names, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Columns not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Without a scale specification factor() would receive NULL levels and
  # silently turn the whole column into NA, so fail loudly instead.
  missing_specs <- setdiff(lst_col_names, names(lst_ord_level_cols))
  if (length(missing_specs) > 0) {
    stop(
      "No ordinal scale specified for: ",
      paste(missing_specs, collapse = ", "),
      call. = FALSE
    )
  }

  for (col_name in lst_col_names) {
    ord_spec <- lst_ord_level_cols[[col_name]]
    responses <- as.character(data[[col_name]])

    # Responses outside the declared scale are converted to NA by factor();
    # surface them so a label typo or a wrong scale assignment is noticed.
    unmatched <- setdiff(unique(responses[!is.na(responses)]), ord_spec$labels)
    if (length(unmatched) > 0) {
      warning(
        "Column `", col_name, "`: responses not in the ordinal scale were set to NA: ",
        paste(unmatched, collapse = "; "),
        call. = FALSE
      )
    }

    data[[col_name]] <- factor(
      responses,
      levels = ord_spec$labels, # the string labels are the 'levels'
      labels = ord_spec$levels, # relabel each level with its numeric score
      ordered = TRUE
    )
  }

  data
}
Apply the above function `convert_string_ord_fac()`:
|> select(all_of(c(ordinal_chr_vars))) |> str() # before conversion
youth <- convert_string_ord_fac(youth, lst_of_ord_cols, ordinal_chr_vars)
youth
# youth |> select(all_of(c(ordinal_chr_vars))) |> str() # after conversion
# Validated! Success!
6.4.3.0.2 Function 2: For converting numeric type ordinal leveled columns into ordered factor
#' Convert numerically encoded ordinal columns into ordered factors
#'
#' @param data A data frame (or tibble) containing the columns to convert.
#' @param lst_col_names Character vector of column names to convert.
#' @return `data` with each requested column replaced by an ordered factor
#'   whose levels are the sorted unique values observed in that column.
#'
#' Because the levels come from the observed values, a scale point that never
#' occurs in a column is not a level. The integer codes of the resulting factor
#' (e.g. from `as.numeric()`) therefore equal the original scores only when
#' every scale point from 1 upwards is present.
convert_number_ord_fac <- function(data, lst_col_names) {
  data[lst_col_names] <- lapply(data[lst_col_names], function(x) {
    factor(x, levels = sort(unique(x)), ordered = TRUE)
  })
  data
}
Apply above Function 2 to
ordinal_numeric_vars
|> select(all_of(c(ordinal_numeric_vars))) |> str() # before conversion
youth
<- convert_number_ord_fac(youth, ordinal_numeric_vars)
youth # youth |> select(all_of(c(ordinal_numeric_vars))) |> str() # after conversion
# Validated! Another success!
6.4.4 Unsure
response entries in ordinal columns:
6.4.4.1 QS4_3_CAREERPLANNIN1_1_1
to QS4_3_CAREERPLANNIN1_8_8
: The only ordinal column with Unsure
entries
- The following table examines number of
Unsure
entries here: 40 “Unsure” responses:
# Summary number of `Unsure` responses for QS4_3_CAREERPLANNIN:
bind_cols(list(count_career_1, count_career_2, count_career_3, count_career_4, count_career_5, count_career_6, count_career_7, count_career_8))
# A tibble: 1 × 8
count_career_1 count_career_2 count_career_3 count_career_4 count_career_5
<int> <int> <int> <int> <int>
1 342 308 380 343 442
# ℹ 3 more variables: count_career_6 <int>, count_career_7 <int>,
# count_career_8 <int>
Decision: We reason that since Unsure
in the context of this question serve similar to a ‘undecided’ stance between the two Likert end, we would retain Unsure
for this ordinal item.
6.4.5 Unsure
and Other, please specify
entries in QS4_4_EDUCATIONALEXPEC
Other, please specify
= 10Unsure
= 11
<- c("QS1_22_HIGHESTEDU", "QS4_4_EDUCATIONALEXPEC") # 10 = Other, 11 = Unsure
cat_unsure_pnts_10 |>
youth select(QS1_22_HIGHESTEDU, QS4_4_EDUCATIONALEXPEC) |>
mutate(across(everything(), as.numeric)) |>
filter(QS1_22_HIGHESTEDU > 10) |>
summarize(count_high_edu_na = n())
# A tibble: 1 × 1
count_high_edu_na
<int>
1 110
A total of 110 respondents answered `Unsure` (code 11) for question 13a (`QS1_22_HIGHESTEDU`). Note that the filter `> 10` used above does not count `Other, please specify` (code 10).
|>
youth select(QS1_22_HIGHESTEDU, QS4_4_EDUCATIONALEXPEC) |>
mutate(across(everything(), as.numeric)) |>
filter(QS4_4_EDUCATIONALEXPEC >= 10) |>
summarize(count_high_edu_na = n())
# A tibble: 1 × 1
count_high_edu_na
<int>
1 91
A total of 91 respondents who had answered either
Unsure
orOther, please specify
for question 42 (QS4_4_EDUCATIONALEXPEC
).Decision: We choose retain
Unsure
for this ordinal item.
6.4.6 Remove *_PreferNotToSay
, *_Unsure
, or *_Other
OHE’d categorical variables directly from raw data:
Downstream Plan 1: These
*_PreferNotToSay
,*_Unsure
, and*_Other
ending variables are already in the raw dataframe. The subsequent one-hot encoding we’ll perform on other nominal variables that had not been properly OHE’d will go through the same removal once again.Downstream Plan 2: There are binary (
Yes
orNo
) responses to some of the categorical columns as well. After the downstream OHE, theNo
column corresponding to those binary columns will be removed.
|>
youth select((ends_with(c("_PreferNotToSay", "_Unsure", "_Other")))) |>
ncol()
[1] 21
# Removal:
<- youth |>
youth select(-(ends_with(c("_PreferNotToSay", "_Unsure", "_Other"))))
There are 21 columns of
*_PreferNotToSay
,*_Unsure
, or*_Other
categorical variables, and these columns have been removed.
7 Data train-test split
<- initial_split(youth, prop = .80, strata = QS1_28_EMPLOYMENT_calculated)
split <- training(split)
train_data <- testing(split) test_data
7.0.1 Impute missing (NA) entries in ordinal columns:
- Imputation should be done after train/test data splitting to prevent test data leakage into the train set!
<- recipes::recipe(QS1_28_EMPLOYMENT_calculated ~ ., data = train_data) |>
preprocessing_recipe # step_impute_knn(all_of(c(all_nominal_predictors(),
# all_numeric_predictors())),
# -all_of(metric_vars), neighbors = 5) |> # KNN Imputation is too computational time consuming for this large dataset without special handling-- we pass it just for now.
step_impute_mode(all_ordered_predictors()) |>
step_mutate_at( all_ordered_predictors(), fn = as.numeric) |>
step_impute_mode(all_nominal_predictors()) |>
# cate_vars are those vars that were categorical but not ohe.
step_dummy(any_of(all_nominal_predictors()), one_hot = TRUE) |>
# Standardize continuous variables:
step_center(all_of(metric_vars)) |>
step_scale(all_of(metric_vars)) |>
# Mean impute continuous variables:
step_impute_mean(all_of(metric_vars)) |>
step_impute_median(all_numeric_predictors(), -all_of(metric_vars)) |>
# Impute income output with its median
step_impute_median(all_outcomes())
Comment:
- Code viewers can run the following code snippet in the console to see concise count of each ordinal level! :
mean_mode_imputed_train |> select(all_of(c(ordinal_numeric_vars, ordinal_chr_vars))) |> summary() mean_mode_imputed_train |> select(all_of(ordinal_chr_vars)) |> summary()
- Continuous variables are standardized.
<- preprocessing_recipe |>
mean_mode_impute_prep prep()
<- bake(mean_mode_impute_prep, new_data = NULL)
mean_mode_imputed_train <- bake(mean_mode_impute_prep, new_data = test_data) mean_mode_imputed_test
7.0.2 Only one-hot encode the nominal columns without imputation:
- Continuous variables are standardized.
<- recipes::recipe(QS1_28_EMPLOYMENT_calculated ~ ., data = train_data) |>
preprocessing_recipe_ohe_unimputed step_dummy(all_of(cate_vars), one_hot = TRUE) |>
step_center(all_of(metric_vars)) |>
step_scale(all_of(metric_vars))
<- preprocessing_recipe_ohe_unimputed |>
ohe_unimpute_prep prep()
<- bake(ohe_unimpute_prep, new_data = NULL)
ohe_unimputed_train <- bake(ohe_unimpute_prep, new_data = test_data) ohe_unimputed_test
Validate that, post-OHE, the column and row counts are the same between `mean_mode_imputed_train` and `ohe_unimputed_train` (the checks below compare counts only, not the column names themselves):
identical(ncol(ohe_unimputed_train), ncol(mean_mode_imputed_train))# Outputs TRUE! They are the same!
[1] TRUE
identical(nrow(ohe_unimputed_train), nrow(mean_mode_imputed_train))# Outputs TRUE! They are the same!
[1] TRUE
nrow(ohe_unimputed_train)
[1] 1915
nrow(mean_mode_imputed_train)
[1] 1915
nrow(youth)
[1] 2396
ncol(mean_mode_imputed_train)
[1] 699
ncol(ohe_unimputed_train)
[1] 699
ncol(youth)
[1] 401
7.1 Examine new one-hot encoded columns:
Task 1: Let’s look at how many new one-hot encoded columns contain *_PreferNotToSay
, *_Unsure
, and *_Other
endings, which need to be removed.
Task 2: Also look at how many new ohe’d columns contain the ending of: *_No
and numeric encoding for *_PreferNotToSay
, *_Unsure
, and *_Other.
7.1.0.1 Remove *_PreferNotToSay
, *_Prefer.not.to.answer
, *_Unsure
, and *_Other
:
|>
mean_mode_imputed_train select((ends_with(c("_PreferNotToSay", "_Prefer.not.to.say", "_Prefer.not.to.answer", "_Unsure", "_Other")))) |>
ncol()
[1] 43
|>
ohe_unimputed_train select((ends_with(c("_PreferNotToSay", "_Prefer.not.to.say", "_Prefer.not.to.answer", "_Unsure", "_Other")))) |>
ncol()
[1] 43
Both imputed and unimputed OHE’d train datasets output 43, meaning there are 43 columns ending in “_PreferNotToSay”, “_Prefer.not.to.say”, “_Prefer.not.to.answer”, “_Unsure”, or “_Other”.
# Removals (train):
<- mean_mode_imputed_train |>
mean_mode_imputed_train select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))
<- ohe_unimputed_train |>
ohe_unimputed_train select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))
# Removals (test):
<- mean_mode_imputed_test |>
mean_mode_imputed_test select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))
<- ohe_unimputed_test |>
ohe_unimputed_test select(-(ends_with(c("_PreferNotToSay", "_Prefer.not.to.answer", "_Prefer.not.to.say", "_Unsure", "_Other"))))
7.1.0.2 Remove *_No
|>
mean_mode_imputed_train select((contains(c("_No")))) |>
ncol()
[1] 47
|>
mean_mode_imputed_train select((ends_with(c("_No")))) |>
ncol()
[1] 38
|>
ohe_unimputed_train select((ends_with(c("_No")))) |>
ncol()
[1] 38
Both imputed and unimputed OHE’d train datasets output 38, meaning there are 38 columns of “*_No”.
# Drop the one-hot columns for the "No" response of binary items.
# `ends_with("_No")` targets the 38 columns counted above. The previous
# `contains("_No")` matched "_No" anywhere in a name (47 columns in the count
# above) and so also removed columns that merely contain that substring.

# Removals (train):
mean_mode_imputed_train <- mean_mode_imputed_train |>
  select(-ends_with("_No"))
ohe_unimputed_train <- ohe_unimputed_train |>
  select(-ends_with("_No"))

# Removals (test):
mean_mode_imputed_test <- mean_mode_imputed_test |>
  select(-ends_with("_No"))
ohe_unimputed_test <- ohe_unimputed_test |>
  select(-ends_with("_No"))
7.1.0.3 Remove numerically encoded *_PreferNotToSay, *_Unsure, *_Other, and *_No:
7.1.0.4
_PreferNotToSay: For convenience, we sometimes use ‘pnts’ as shorthand for Prefer Not to Say.
# Base variable names grouped by the numeric level at which their
# "No" / "Unsure" / "Prefer not to say" (pnts) dummy columns start.

# >=2 is no/unsure/pnts:
cat_unsure_pnts_2_or_more <- c(
  "QS1_5_INDIGENOUSHS", "QS1_17_INCARE", "QS1_27_PLANNINGRE",
  "QS2_5_MENTOR611PR", "QS2_7_MENTOR611SE", "QS2_11_MENTOR1218",
  paste0("QS2_22_GEOLOCATI", 1:3),
  # QS3_4_LIFEEVENTS1_1_1 ... QS3_4_LIFEEVENTS1_20_20
  sprintf("QS3_4_LIFEEVENTS1_%d_%d", 1:20, 1:20),
  "QS4_6_DISAPPOINTED",
  # QS4_13_LIFEEVE1_1_1 ... QS4_13_LIFEEVE1_6_6
  sprintf("QS4_13_LIFEEVE1_%d_%d", 1:6, 1:6),
  "QS4_16_FORMALVOL", "QS4_26_INTERNETC",
  "QS4_28_INTERNETCON", "QS4_31_MOBILECONNE", "QS4_33_MOBILECONNECT"
)

# >=3 is no/unsure/pnts
cat_unsure_pnts_3 <- c(
  "QS4_1_MEANINGFULPERSON",
  "QS4_21_MENTORING",
  "QS4_29_PRIVATECONN"
)

# >=4 is unsure/pnts
cat_unsure_pnts_4 <- c(
  "QS4_22_PASTMENTO",
  paste0("QS2_17_TYPE_", 1:3),
  paste0("QS2_18_LOCATION_", 1:3),
  "QS4_30_INTERNETCON"
)

# >=X11 is unsure/pnts
cat_unsure_pnts_11 <- c("QS4_4_EDUCATIONALEXPEC")

# Variables whose X0 level is dropped by the removal step below.
cat_X0_no <- c(
  sprintf("QS4_27_INTERNETC1_%d_%d", 1:4, 1:4),
  sprintf("QS4_32_MOBILECONNE1_%d_%d", 1:4, 1:4)
)
# *** also need to remove QS4_30_INTERNETCON_X2 independently
- Variables listed in `cat_unsure_pnts_3` contain Unsure and/or Prefer not to say at encoding Unsure = 3 and Prefer not to say = 4.
- Variables listed in `cat_unsure_pnts_4` contain Unsure and/or Prefer not to say at encoding Unsure = 4 and/or Prefer not to say = 5.
- Variables listed in `cat_unsure_pnts_11` contain Unsure and/or Prefer not to say at encoding Unsure = 11 and Prefer not to say = 12.
- Internet access variables listed in `cat_internet_access` contain Unsure and/or Prefer not to say at encoding Unsure = 3 and Prefer not to say = 4.
# Remove numerically encoded No / Unsure / Prefer-not-to-say dummy columns
# from mean_mode_imputed_train.
# The pattern is built from the category vectors defined above; each group is
# paired with the level suffixes that encode a non-response for that group.
nonresponse_pattern <- paste0(
  "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
  "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
  "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
  # The cat_unsure_pnts_11 dummies are named `<var>_X11` (Unsure) and
  # `<var>_X12` (Prefer not to say); ends_with("X_11") never matched them,
  # which left QS4_4_EDUCATIONALEXPEC_X11 in the data.
  "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
  "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
  "QS4_30_INTERNETCON_X2"
)
mean_mode_imputed_train <- mean_mode_imputed_train |>
  select(-matches(nonresponse_pattern)) |>
  select(-ends_with("X_11")) # kept from the original step; harmless if nothing matches

# Verify removal:
# Lists every remaining column that ends in one of these level suffixes. Levels
# outside the pattern above (e.g. X2/X3 of the cat_unsure_pnts_4 variables) are
# expected to remain, so a non-empty list is not by itself an error.
remaining_cols <- grep(
  "_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$",
  colnames(mean_mode_imputed_train),
  value = TRUE
)
if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}
[1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
# Remove numerically encoded No / Unsure / Prefer-not-to-say dummy columns
# from ohe_unimputed_train.
# The pattern is built from the category vectors defined above; each group is
# paired with the level suffixes that encode a non-response for that group.
nonresponse_pattern <- paste0(
  "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
  "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
  "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
  # The cat_unsure_pnts_11 dummies are named `<var>_X11` (Unsure) and
  # `<var>_X12` (Prefer not to say); ends_with("X_11") never matched them,
  # which left QS4_4_EDUCATIONALEXPEC_X11 in the data.
  "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
  "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
  "QS4_30_INTERNETCON_X2"
)
ohe_unimputed_train <- ohe_unimputed_train |>
  select(-matches(nonresponse_pattern)) |>
  select(-ends_with("X_11")) # kept from the original step; harmless if nothing matches

# Verify removal:
# Lists every remaining column that ends in one of these level suffixes. Levels
# outside the pattern above (e.g. X2/X3 of the cat_unsure_pnts_4 variables) are
# expected to remain, so a non-empty list is not by itself an error.
remaining_cols <- grep(
  "_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$",
  colnames(ohe_unimputed_train),
  value = TRUE
)
if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}
[1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
# Remove numerically encoded No / Unsure / Prefer-not-to-say dummy columns
# from mean_mode_imputed_test.
# The pattern is built from the category vectors defined above; each group is
# paired with the level suffixes that encode a non-response for that group.
nonresponse_pattern <- paste0(
  "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
  "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
  "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
  # The cat_unsure_pnts_11 dummies are named `<var>_X11` (Unsure) and
  # `<var>_X12` (Prefer not to say); ends_with("X_11") never matched them,
  # which left QS4_4_EDUCATIONALEXPEC_X11 in the data.
  "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
  "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
  "QS4_30_INTERNETCON_X2"
)
mean_mode_imputed_test <- mean_mode_imputed_test |>
  select(-matches(nonresponse_pattern)) |>
  select(-ends_with("X_11")) # kept from the original step; harmless if nothing matches

# Verify removal:
# Lists every remaining column that ends in one of these level suffixes. Levels
# outside the pattern above (e.g. X2/X3 of the cat_unsure_pnts_4 variables) are
# expected to remain, so a non-empty list is not by itself an error.
remaining_cols <- grep(
  "_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$",
  colnames(mean_mode_imputed_test),
  value = TRUE
)
if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}
[1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
# Remove numerically encoded No / Unsure / Prefer-not-to-say dummy columns
# from ohe_unimputed_test.
# The pattern is built from the category vectors defined above; each group is
# paired with the level suffixes that encode a non-response for that group.
nonresponse_pattern <- paste0(
  "(", paste(cat_unsure_pnts_2_or_more, collapse = "|"), ")_(X2|X3|X4)$|",
  "(", paste(cat_unsure_pnts_3, collapse = "|"), ")_(X3|X4|X5)$|",
  "(", paste(cat_unsure_pnts_4, collapse = "|"), ")_(X4|X5)$|",
  # The cat_unsure_pnts_11 dummies are named `<var>_X11` (Unsure) and
  # `<var>_X12` (Prefer not to say); ends_with("X_11") never matched them,
  # which left QS4_4_EDUCATIONALEXPEC_X11 in the data.
  "(", paste(cat_unsure_pnts_11, collapse = "|"), ")_(X11|X12)$|",
  "(", paste(cat_X0_no, collapse = "|"), ")_(X0)$|",
  "QS4_30_INTERNETCON_X2"
)
ohe_unimputed_test <- ohe_unimputed_test |>
  select(-matches(nonresponse_pattern)) |>
  select(-ends_with("X_11")) # kept from the original step; harmless if nothing matches

# Verify removal:
# Lists every remaining column that ends in one of these level suffixes. Levels
# outside the pattern above (e.g. X2/X3 of the cat_unsure_pnts_4 variables) are
# expected to remain, so a non-empty list is not by itself an error.
remaining_cols <- grep(
  "_X0$|_X2$|_X3$|_X4$|_X5$|_X11$|_X_11$",
  colnames(ohe_unimputed_test),
  value = TRUE
)
if (length(remaining_cols) > 0) {
  message("Warning: The following columns with _X0, _X2, _X3, _X4, _X5, _X11, or _X_11 remain:")
  print(remaining_cols)
} else {
  message("All specified _X0, _X2, _X3, _X4, _X5, _X11, and _X_11 columns were successfully removed.") # correct!~
}
[1] "QS2_17_TYPE_1_X2" "QS2_17_TYPE_1_X3"
[3] "QS2_18_LOCATION_1_X2" "QS2_18_LOCATION_1_X3"
[5] "QS2_17_TYPE_2_X2" "QS2_17_TYPE_2_X3"
[7] "QS2_18_LOCATION_2_X2" "QS2_18_LOCATION_2_X3"
[9] "QS2_17_TYPE_3_X2" "QS2_17_TYPE_3_X3"
[11] "QS2_18_LOCATION_3_X2" "QS2_18_LOCATION_3_X3"
[13] "QS2_23_MOSTMEANI_X2" "QS2_23_MOSTMEANI_X3"
[15] "QS4_1_MEANINGFULPERSON_X2" "QS4_4_EDUCATIONALEXPEC_X2"
[17] "QS4_4_EDUCATIONALEXPEC_X3" "QS4_4_EDUCATIONALEXPEC_X4"
[19] "QS4_4_EDUCATIONALEXPEC_X5" "QS4_4_EDUCATIONALEXPEC_X11"
[21] "QS4_21_MENTORING_X2" "QS4_22_PASTMENTO_X2"
[23] "QS4_22_PASTMENTO_X3" "QS4_29_PRIVATECONN_X2"
[25] "QS4_30_INTERNETCON_X3"
7.1.1 Remove Possible Dependent Variables of Interest
Data leakage means inadvertently disclosing (or leaking) information about the dependent outcome variables (or the testing data set) to the predictor variables in the training set. If we allow the outcome variables to exert antecedent influence on their predictors, and subsequently use the very same predictors to predict those outcome variables, we end up with circular reasoning, which can lead to falsely inflated relationships between predictors and outcomes.
To avoid data leakage, we need to carefully remove all possible dependent variables we might use as outcomes, especially BEFORE we perform KNN imputation.
- KNN imputation assesses the relationships between variables to approximate missing values. If dependent variables (DVs) are included, imputed values for the independent variables (IVs) will be influenced by the DVs, leading to a biased relationship between the IVs and the DVs.
Decision: We will pass the possible DVs through (unimputed) in the KNN imputation pipeline in Python.
# Name prefixes of the candidate dependent-variable columns.
candidate_dv_prefixes <- c(
  "QS1_19_HIGHSCHOOL", "QS4_19_CURRENTME", "QS4_18_CURRENTOR",
  "QS1_20_HIGHSCHOOL", "QS1_21_FURTHEDUCA", "QS1_22_HIGHESTEDU",
  "QS1_23_YEARCOMPLE", "QS1_25_EMPLOYMENT", "QS1_26_EMPLOYMENT",
  "QS1_27_PLANNINGRE", "QS1_28_EMPLOYMENT", "QS4_4_EDUCATIONALEXPEC",
  "QS4_5_SATEDU", "QS4_6_DISAPPOINTED", # these are self-motivation variables
  "QS4_7_SOCIALCAPITAL", "QS4_8_HELPSEEKING", "QS4_9_MENTALHEALTH",
  "QS4_10_MENTALWELLBE", "QS4_11_BELONGING", "QS4_12_TRUST",
  "QS4_16_FORMALVOL", "QS4_25_FUTUREMEN", "QS4_21_MENTORING",
  "QS4_17_SERVEDASM", "QS4_18_CURRENTOR", "QS4_22_PASTMENTO",
  "QS4_1_MEANINGFULPERSON", "QS4_13_LIFEEVE"
)

# Per-column count of missing values among the candidate DV columns of the
# unimputed one-hot-encoded training set.
dv_columns <- select(ohe_unimputed_train, starts_with(candidate_dv_prefixes))
dv_na_counts <- summarize(dv_columns, across(everything(), \(col) sum(is.na(col))))
glimpse(dv_na_counts)
8 Data frame export:
8.0.0.1 Export train/test data sets
As comma separated file:
# CSV export of the train/test data sets (currently disabled).
# write_csv(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.csv")
# write_csv(ohe_unimputed_train, "../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.csv")
# write_csv(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.csv")
#
# write_csv(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.csv")
# write_csv(mean_mode_imputed_test, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_test.csv")
As R object (to retain the transformed data types)
# RDS export (currently disabled).
# write_rds(train_data, "../../dssg-2025-mentor-canada/Data/umimputed_train.Rds")
# write_rds(test_data, "../../dssg-2025-mentor-canada/Data/umimputed_test.Rds")
# write_rds(mean_mode_imputed_train, "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.Rds")
- Alternative option: Try using Feather to preserve the transformed data types:
# Write each train/test data set to its Feather file.
# The list maps destination path -> data frame; files are written in list order.
feather_exports <- list(
  "../../dssg-2025-mentor-canada/Data/umimputed_train.feather" = train_data,
  "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.feather" = mean_mode_imputed_train,
  "../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.feather" = ohe_unimputed_train,
  "../../dssg-2025-mentor-canada/Data/umimputed_test.feather" = test_data,
  "../../dssg-2025-mentor-canada/Data/mean_mode_imputed_test.feather" = mean_mode_imputed_test,
  "../../dssg-2025-mentor-canada/Data/ohe_unimputed_test.feather" = ohe_unimputed_test
)
for (feather_path in names(feather_exports)) {
  write_feather(feather_exports[[feather_path]], feather_path)
}