import pandas as pd
import os
from siuba import _, group_by, summarize, filter, select, mutate, arrange, count
import matplotlib.pyplot as plt
2 Data preprocessing and More Visualizations
2.1 Python: Preprocessing and Cleaning
2.1.1 Library Imports
2.1.2 Importing Data
# Load the 2020 youth survey responses into a DataFrame.
# The path is relative, so it resolves against the current working directory.
youth = pd.read_csv('../../dssg-2025-mentor-canada/Data/Data_2020-Youth-Survey.csv')
2.1.3 Dropping Columns
# Drop columns up to AO: spreadsheet column AO is the 41st column, so keeping
# positions 41 onward discards the first 41 columns (A through AO).
youth = youth.iloc[:, 41:]
# Drop logic + validation columns.
# NOTE: DataFrame.drop raises KeyError by default if any listed label is
# missing, so every name here must match a column header exactly (several
# labels look truncated, e.g. 'QS1_30_MValidatio' -- presumably that is how
# they appear in the export; verify against the CSV header).
youth = youth.drop(columns=[
    'QAge_Validation', 'Logic_QS1_6_Qtext', 'Logic_Qtext', 'QS1_8_Validation',
    'Logic_QS1_26_Ask', 'QS1_29_Validation',
    'QS1_30_MValidatio', 'QS1_30_SMValidati', 'QS1_31_BWValidati',
    'QS1_32_WValidatio', 'QS2_10_Validation', 'Logic_QS2_14_Ask',
    'Logic_MENTORID1_1_1',
    'Logic_MENTORID1_2_2', 'Logic_MENTORID1_3_3', 'Logic_AP_QS2_23',
    'Logic_QS2_27_Ask', 'Logic_QS2_34_Valid', 'Logic_QS2_35_Ask',
    'Logic_QS2_35_Mask1_1_1',
    'Logic_QS2_35_Mask1_2_2', 'Logic_QS2_35_Mask1_3_3',
    'Logic_QS2_35_Mask1_4_4', 'Logic_QS2_35_Mask1_5_5',
    'Logic_QS2_35_Mask1_6_6', 'Logic_QS2_35_Mask1_7_7',
    'Logic_QS2_35_Mask1_8_8', 'Logic_QS2_35_Mask1_9_9',
    'Logic_QS2_35_Mask1_10_10', 'QS4_14_Validatio', 'QS4_15_Validatio',
    'QS4_19_Validatio', 'QS4_23_Validatio',
])
# Drop parent education columns (keeping first two parent education columns).
# The slice is positional (columns 102 through 119 of the frame as it stands
# at this point), so it depends on the earlier drops having already run.
youth = youth.drop(youth.columns[102:120], axis=1)
2.1.4 Indicate Text Columns
# Names of the survey columns flagged as text columns in this section
# (per the section heading "Indicate Text Columns").
text_columns = [
    'QS1_6_Other', 'QS1_9_Other', 'QS1_11_Other', 'QS1_16_Other',
    'QS1_18_Other_1', 'QS1_18_Other_2',
    'QS1_22_Other', 'QS1_26_Other', 'QS1_27_Other', 'QS2_13_Other',
    'QS2_14_MENTORID', 'QS2_14_MENTORID_2', 'QS2_14_MENTORID_3',
    'QS2_18_LOCATION_1_O', 'QS2_15_RELATIONSHIP2', 'QS2_17_TYPE_2_Other',
    'QS2_18_LOCATION_2_O', 'QS2_15_RELATIONSHIP3',
    'QS2_17_TYPE_3_Other', 'QS2_18_LOCATION_3_O', 'QS2_25_YOUTHINIT2',
    'QS2_27_MENTORPROGRA2', 'QS2_33_TRANSITIONS_Ot', 'QS2_34_SUPPORTS_Ot',
    'QS2_38_NETGATIVEMENTO', 'QS3_2_TRANSITIONWITHOUTMEN',
    'QS3_3_TRANSITIONSWITHOUTMENTO', 'QS4_4_Other', 'QS4_5_SATEDU_Other',
]
2.1.5 Encode Columns
2.1.5.1 Make column names interpretable for Q4
# Give the QS1_6 ethnocultural indicator columns readable `Race_*` names.
# The rename is done in place, so `youth` itself is modified and the call
# returns None.
youth.rename(columns={
    'QS1_6_ETHNOCULTURAL1_1_1': 'Race_SouthAsian',
    'QS1_6_ETHNOCULTURAL1_2_2': 'Race_Chinese',
    'QS1_6_ETHNOCULTURAL1_3_3': 'Race_Black',
    'QS1_6_ETHNOCULTURAL1_4_4': 'Race_Filipino',
    'QS1_6_ETHNOCULTURAL1_5_5': 'Race_LatinAmerica',
    'QS1_6_ETHNOCULTURAL1_6_6': 'Race_Arab',
    'QS1_6_ETHNOCULTURAL1_7_7': 'Race_SouthEastAsian',
    'QS1_6_ETHNOCULTURAL1_8_8': 'Race_WestAsian',
    'QS1_6_ETHNOCULTURAL1_9_9': 'Race_Korean',
    'QS1_6_ETHNOCULTURAL1_10_10': 'Race_Japanese',
    'QS1_6_ETHNOCULTURAL1_11_11': 'Race_White',
    'QS1_6_ETHNOCULTURAL1_12_12': 'Race_Other',
    'QS1_6_ETHNOCULTURAL1_13_13': 'Race_Unsure',
    'QS1_6_ETHNOCULTURAL1_14_14': 'Race_PreferNotToSay',
}, inplace=True)
3 Visualizations (continued)
3.1 Distribution of Age variable
3.2 Estimated yearly income distribution (15_yearly_income)
(Presented also in last week’s notebook)
As a measure of central tendency, the median is less susceptible to extreme outliers.
3.3 Income output and Transgender Identity
3.4 Early mentor experience and mental health rating outcome
3.5 
3.6 Teen mentor experience and mental health rating outcome
- The trend is similar to that seen among those who had an early mentor experience (EME).
- We can also examine whether individuals who had an EME tend to continue their mentorship into a teen mentor experience (TME).