library(reticulate)
library(readr)
library(tidyr)
library(knitr)
library(arrow)
library(tidymodels)
library(regclass)
library(xgboost)
set.seed(512) # Set seed to ensure code result reproducibility for randomization
6 Linear Regression and XGBoost
Week 4 (cont'd) - 02
7 Linear model – Regression & Non-linear model – XGBoost
7.1 Data Import
unimputed_train <- read_feather('../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.feather')
mean_mode_imputed_train <- read_feather("../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.feather")
# knn_imputed_train <- read_csv('../../dssg-2025-mentor-canada/Data/faiss_knn_imputed_dataset.csv')
knn_imputed_train <- read_csv('../../dssg-2025-mentor-canada/Data/faiss_tuned_knn_imputed_dataset.csv')
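As a quick sanity check on the imports, the dimensions and remaining missing-value counts can be printed. This is a minimal sketch, not part of the original analysis; the imputed training sets should report zero missing values.
# Sanity check on the loaded training sets (illustrative only)
dim(mean_mode_imputed_train)
dim(knn_imputed_train)
sum(is.na(mean_mode_imputed_train))  # expected to be 0 after mean/mode imputation
sum(is.na(knn_imputed_train))        # expected to be 0 after KNN imputation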
7.2 Model fitting
7.2.1 Linear model - Linear Regression (Ordinary Least Squares)
model <- lm(QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_11_11_X1 +
              QS3_4_LIFEEVENTS1_16_16_X1 +
              QS3_4_LIFEEVENTS1_18_18_X1 +
              QS2_3_PRESENCEOFM_Yes +
              QS2_9_PRESENCEOFA_Yes +
              QS2_6_MENTOREXPER +
              QS1_1_AGE, data = mean_mode_imputed_train)
summary(model)
Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_11_11_X1 +
QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 +
QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER +
QS1_1_AGE, data = mean_mode_imputed_train)
Residuals:
Min 1Q Median 3Q Max
-232749 -67484 -37164 -15642 14499427
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 194862 141557 1.377 0.16881
QS3_4_LIFEEVENTS1_11_11_X1 -9546 36411 -0.262 0.79321
QS3_4_LIFEEVENTS1_16_16_X1 101558 33091 3.069 0.00218 **
QS3_4_LIFEEVENTS1_18_18_X1 -1370 37757 -0.036 0.97106
QS2_3_PRESENCEOFM_Yes 15939 32660 0.488 0.62558
QS2_9_PRESENCEOFA_Yes -11551 26758 -0.432 0.66603
QS2_6_MENTOREXPER -26100 28227 -0.925 0.35527
QS1_1_AGE 15246 11966 1.274 0.20279
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 521800 on 1907 degrees of freedom
Multiple R-squared: 0.007469, Adjusted R-squared: 0.003826
F-statistic: 2.05 on 7 and 1907 DF, p-value: 0.04584
lg_model <- glm(QS2_9_PRESENCEOFA_Yes ~ QS3_4_LIFEEVENTS1_11_11_X1 +
                  QS3_4_LIFEEVENTS1_16_16_X1 +
                  QS3_4_LIFEEVENTS1_18_18_X1 +
                  QS2_3_PRESENCEOFM_Yes +
                  QS1_1_AGE, data = mean_mode_imputed_train,
                family = binomial)
summary(lg_model)
Call:
glm(formula = QS2_9_PRESENCEOFA_Yes ~ QS3_4_LIFEEVENTS1_11_11_X1 +
QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 +
QS2_3_PRESENCEOFM_Yes + QS1_1_AGE, family = binomial, data = mean_mode_imputed_train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.06455 0.07208 -14.770 < 2e-16 ***
QS3_4_LIFEEVENTS1_11_11_X1 0.14958 0.15533 0.963 0.33558
QS3_4_LIFEEVENTS1_16_16_X1 0.37065 0.14003 2.647 0.00812 **
QS3_4_LIFEEVENTS1_18_18_X1 -0.30704 0.16228 -1.892 0.05849 .
QS2_3_PRESENCEOFM_Yes 1.81698 0.10417 17.443 < 2e-16 ***
QS1_1_AGE -0.03502 0.05096 -0.687 0.49195
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2616.8 on 1914 degrees of freedom
Residual deviance: 2263.5 on 1909 degrees of freedom
AIC: 2275.5
Number of Fisher Scoring iterations: 4
- The low R-squared (about 0.007 on the mean/mode-imputed data) suggests that these predictors, as specified, explain very little of the variation in annual income.
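The residuals are also heavily right-skewed (the maximum exceeds 14 million while the quartiles lie within roughly ±70,000), so one natural follow-up is to refit the same specification on log-transformed income, mirroring the log(y + 1) target used for the XGBoost models below. A minimal sketch, assuming mean_mode_imputed_train is loaded as above; model_log is a hypothetical name and its results are not part of the original output.
# Hypothetical follow-up (not in the original analysis): same predictors,
# but with log-transformed annual income as the outcome
model_log <- lm(log(QS1_28_EMPLOYMENT_calculated + 1) ~ QS3_4_LIFEEVENTS1_11_11_X1 +
                  QS3_4_LIFEEVENTS1_16_16_X1 +
                  QS3_4_LIFEEVENTS1_18_18_X1 +
                  QS2_3_PRESENCEOFM_Yes +
                  QS2_9_PRESENCEOFA_Yes +
                  QS2_6_MENTOREXPER +
                  QS1_1_AGE, data = mean_mode_imputed_train)
summary(model_log)  # coefficients are on the log-income scale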
7.2.2 Running linear and logistic regression on the KNN-imputed data
model_knn_imputed <- lm(QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_11_11_X1 +
                          QS3_4_LIFEEVENTS1_16_16_X1 +
                          QS3_4_LIFEEVENTS1_18_18_X1 +
                          QS2_3_PRESENCEOFM_Yes +
                          QS2_9_PRESENCEOFA_Yes +
                          QS2_6_MENTOREXPER +
                          QS1_1_AGE, data = knn_imputed_train)
summary(model_knn_imputed)
Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_11_11_X1 +
QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 +
QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER +
QS1_1_AGE, data = knn_imputed_train)
Residuals:
Min 1Q Median 3Q Max
-360130 -148233 -88360 77485 14415317
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 300527 131130 2.292 0.02202 *
QS3_4_LIFEEVENTS1_11_11_X1 12438 37022 0.336 0.73694
QS3_4_LIFEEVENTS1_16_16_X1 94411 33652 2.805 0.00508 **
QS3_4_LIFEEVENTS1_18_18_X1 9908 38405 0.258 0.79645
QS2_3_PRESENCEOFM_Yes 20112 28883 0.696 0.48630
QS2_9_PRESENCEOFA_Yes -15747 27735 -0.568 0.57025
QS2_6_MENTOREXPER -29268 28554 -1.025 0.30549
QS1_1_AGE -23854 12162 -1.961 0.04999 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 530600 on 1907 degrees of freedom
Multiple R-squared: 0.009167, Adjusted R-squared: 0.00553
F-statistic: 2.52 on 7 and 1907 DF, p-value: 0.01399
lg_model_knn_imputed <- glm(QS2_9_PRESENCEOFA_Yes ~ QS3_4_LIFEEVENTS1_11_11_X1 +
                              QS3_4_LIFEEVENTS1_16_16_X1 +
                              QS3_4_LIFEEVENTS1_18_18_X1 +
                              QS2_3_PRESENCEOFM_Yes +
                              QS1_1_AGE, data = knn_imputed_train,
                            family = binomial)
summary(lg_model_knn_imputed)
Call:
glm(formula = QS2_9_PRESENCEOFA_Yes ~ QS3_4_LIFEEVENTS1_11_11_X1 +
QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 +
QS2_3_PRESENCEOFM_Yes + QS1_1_AGE, family = binomial, data = knn_imputed_train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.06455 0.07208 -14.770 < 2e-16 ***
QS3_4_LIFEEVENTS1_11_11_X1 0.14958 0.15533 0.963 0.33558
QS3_4_LIFEEVENTS1_16_16_X1 0.37065 0.14003 2.647 0.00812 **
QS3_4_LIFEEVENTS1_18_18_X1 -0.30704 0.16228 -1.892 0.05849 .
QS2_3_PRESENCEOFM_Yes 1.81698 0.10417 17.443 < 2e-16 ***
QS1_1_AGE -0.03502 0.05096 -0.687 0.49195
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2616.8 on 1914 degrees of freedom
Residual deviance: 2263.5 on 1909 degrees of freedom
AIC: 2275.5
Number of Fisher Scoring iterations: 4
7.2.2.1 Multicollinearity:
VIF(model)
QS3_4_LIFEEVENTS1_11_11_X1 QS3_4_LIFEEVENTS1_16_16_X1
1.297750 1.165585
QS3_4_LIFEEVENTS1_18_18_X1 QS2_3_PRESENCEOFM_Yes
1.321109 1.779627
QS2_9_PRESENCEOFA_Yes QS2_6_MENTOREXPER
1.234112 1.504845
QS1_1_AGE
1.006562
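All VIFs sit between 1.0 and 1.8, so multicollinearity does not appear to be a concern for this specification. The same check can be repeated on the KNN-imputed fit; this call is a sketch and its output is not part of the original results.
# VIFs for the model fit on the KNN-imputed data, for comparison
VIF(model_knn_imputed)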
7.3 Mentor Influence on Annual Income
model_mentors_influence <- lm(QS1_28_EMPLOYMENT_calculated ~ QS2_36_INFLUENCE1_1_1 +
                                QS2_36_INFLUENCE1_2_2 +
                                QS2_36_INFLUENCE1_3_3 +
                                QS2_36_INFLUENCE1_4_4 +
                                QS2_36_INFLUENCE1_5_5 +
                                QS2_36_INFLUENCE1_6_6 +
                                QS2_36_INFLUENCE1_7_7 +
                                QS2_36_INFLUENCE1_8_8 +
                                QS2_36_INFLUENCE1_9_9 + QS1_1_AGE,
                              data = mean_mode_imputed_train)
summary(model_mentors_influence)
Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS2_36_INFLUENCE1_1_1 +
QS2_36_INFLUENCE1_2_2 + QS2_36_INFLUENCE1_3_3 + QS2_36_INFLUENCE1_4_4 +
QS2_36_INFLUENCE1_5_5 + QS2_36_INFLUENCE1_6_6 + QS2_36_INFLUENCE1_7_7 +
QS2_36_INFLUENCE1_8_8 + QS2_36_INFLUENCE1_9_9 + QS1_1_AGE,
data = mean_mode_imputed_train)
Residuals:
Min 1Q Median 3Q Max
-201802 -73920 -53003 -30317 14572589
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2930.3 81687.8 -0.036 0.971
QS2_36_INFLUENCE1_1_1 5497.6 20300.2 0.271 0.787
QS2_36_INFLUENCE1_2_2 28136.6 18942.8 1.485 0.138
QS2_36_INFLUENCE1_3_3 14195.6 20573.8 0.690 0.490
QS2_36_INFLUENCE1_4_4 -18035.5 20577.5 -0.876 0.381
QS2_36_INFLUENCE1_5_5 -29424.9 25155.4 -1.170 0.242
QS2_36_INFLUENCE1_6_6 -9623.3 25983.7 -0.370 0.711
QS2_36_INFLUENCE1_7_7 -175.4 24750.1 -0.007 0.994
QS2_36_INFLUENCE1_8_8 22186.0 24978.3 0.888 0.375
QS2_36_INFLUENCE1_9_9 21026.0 21434.9 0.981 0.327
QS1_1_AGE 14898.5 12013.4 1.240 0.215
Residual standard error: 523000 on 1904 degrees of freedom
Multiple R-squared: 0.00439, Adjusted R-squared: -0.0008392
F-statistic: 0.8395 on 10 and 1904 DF, p-value: 0.5904
model_mentors_influence_knn_imputed <- lm(QS1_28_EMPLOYMENT_calculated ~ QS2_36_INFLUENCE1_1_1 +
                                            QS2_36_INFLUENCE1_2_2 +
                                            QS2_36_INFLUENCE1_3_3 +
                                            QS2_36_INFLUENCE1_4_4 +
                                            QS2_36_INFLUENCE1_5_5 +
                                            QS2_36_INFLUENCE1_6_6 +
                                            QS2_36_INFLUENCE1_7_7 +
                                            QS2_36_INFLUENCE1_8_8 +
                                            QS2_36_INFLUENCE1_9_9 + QS1_1_AGE,
                                          data = knn_imputed_train)
summary(model_mentors_influence_knn_imputed)
Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS2_36_INFLUENCE1_1_1 +
QS2_36_INFLUENCE1_2_2 + QS2_36_INFLUENCE1_3_3 + QS2_36_INFLUENCE1_4_4 +
QS2_36_INFLUENCE1_5_5 + QS2_36_INFLUENCE1_6_6 + QS2_36_INFLUENCE1_7_7 +
QS2_36_INFLUENCE1_8_8 + QS2_36_INFLUENCE1_9_9 + QS1_1_AGE,
data = knn_imputed_train)
Residuals:
Min 1Q Median 3Q Max
-335602 -155671 -95507 71537 14458352
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 54826.2 83700.4 0.655 0.5125
QS2_36_INFLUENCE1_1_1 21693.6 22329.7 0.972 0.3314
QS2_36_INFLUENCE1_2_2 10472.9 20762.3 0.504 0.6140
QS2_36_INFLUENCE1_3_3 12448.6 22999.1 0.541 0.5884
QS2_36_INFLUENCE1_4_4 5443.6 23194.6 0.235 0.8145
QS2_36_INFLUENCE1_5_5 -41326.6 25327.1 -1.632 0.1029
QS2_36_INFLUENCE1_6_6 -11046.9 27091.7 -0.408 0.6835
QS2_36_INFLUENCE1_7_7 -505.5 27328.5 -0.018 0.9852
QS2_36_INFLUENCE1_8_8 21914.8 25380.3 0.863 0.3880
QS2_36_INFLUENCE1_9_9 28699.2 23535.2 1.219 0.2228
QS1_1_AGE -23661.1 12208.2 -1.938 0.0528 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 531600 on 1904 degrees of freedom
Multiple R-squared: 0.007003, Adjusted R-squared: 0.001787
F-statistic: 1.343 on 10 and 1904 DF, p-value: 0.2017
7.4 Model specification: Lasso regression (L1 regularization, mixture = 1)
(The workflow is built with tidymodels; for now the chunks in this section have echo: false, so only the headings are shown. An illustrative sketch follows Section 7.4.2.)
7.4.0.1 Define cross-validation (CV) folds for tuning & grid:
7.4.1 Tidymodels Workflow
7.4.2 Tuning Regularization Penalty
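Because the chunks in this section are hidden by echo: false, the sketch below only illustrates the general shape of a tidymodels lasso workflow with the penalty tuned over CV folds (mixture = 1). The fold count, grid range, recipe steps, and predictor set here are assumptions for illustration, not the project's actual settings; the glmnet package is required for the engine.
# Illustrative lasso sketch (assumed settings, not the hidden project code)
lasso_rec <- recipe(QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_16_16_X1 +
                      QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes +
                      QS2_6_MENTOREXPER + QS1_1_AGE,
                    data = mean_mode_imputed_train) %>%
  step_normalize(all_numeric_predictors())

lasso_spec <- linear_reg(penalty = tune(), mixture = 1) %>%  # mixture = 1 is pure L1 (lasso)
  set_engine("glmnet")

lasso_wf <- workflow() %>%
  add_recipe(lasso_rec) %>%
  add_model(lasso_spec)

cv_folds <- vfold_cv(mean_mode_imputed_train, v = 5)                   # assumed 5-fold CV
penalty_grid <- grid_regular(penalty(range = c(-4, 2)), levels = 30)   # penalties on the log10 scale

lasso_res <- tune_grid(lasso_wf, resamples = cv_folds, grid = penalty_grid,
                       metrics = metric_set(rmse))
best_penalty <- select_best(lasso_res, metric = "rmse")
final_lasso <- finalize_workflow(lasso_wf, best_penalty) %>%
  fit(data = mean_mode_imputed_train)
tidy(extract_fit_parsnip(final_lasso))  # coefficients retained at the selected penalty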
7.5 XGB
7.5.1 XGB model 1:
7.5.1.1 All 3 SES indicators, mentor experiences, age, and year of higher education completion.
(Mean mode imputed):
x <- model.matrix(~ QS3_4_LIFEEVENTS1_16_16_X1 + QS2_3_PRESENCEOFM_Yes +
                    QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER +
                    QS1_1_AGE + QS1_23_YEARCOMPLE,
                  data = mean_mode_imputed_train)[, -1]
y <- log(mean_mode_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix <- xgb.DMatrix(x, label = y)
xgb_model <- xgboost(data = dmatrix, nrounds = 100, objective = "reg:squarederror",
                     params = list(max_depth = 4, eta = 0.1))
xgb_importance_df <- xgb.importance(model = xgb_model)
xgb_cv <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb_importance_df)
kable(xgb_importance_df)
Feature | Gain | Cover | Frequency |
---|---|---|---|
QS1_23_YEARCOMPLE | 0.3208173 | 0.5076989 | 0.3370697 |
QS1_1_AGE | 0.2958916 | 0.2551412 | 0.2754203 |
QS2_9_PRESENCEOFA_Yes | 0.1522012 | 0.0451389 | 0.0960769 |
QS2_6_MENTOREXPER | 0.0904326 | 0.1172270 | 0.1337070 |
QS3_4_LIFEEVENTS1_16_16_X1 | 0.0827873 | 0.0463520 | 0.0848679 |
QS2_3_PRESENCEOFM_Yes | 0.0578700 | 0.0284421 | 0.0728583 |
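xgb_cv is stored above but never inspected. A minimal sketch for reading its evaluation log and picking the boosting round with the lowest cross-validated RMSE (column names assume the default rmse metric for reg:squarederror):
# Pick the round with the lowest cross-validated RMSE (illustrative)
cv_log <- as.data.frame(xgb_cv$evaluation_log)
best_round <- which.min(cv_log$test_rmse_mean)
cv_log[best_round, c("iter", "test_rmse_mean", "test_rmse_std")]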
(KNN imputed):
x_knn_imputed <- model.matrix(~ QS3_4_LIFEEVENTS1_16_16_X1 + QS2_3_PRESENCEOFM_Yes +
                                QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER +
                                QS1_1_AGE + QS1_23_YEARCOMPLE,
                              data = knn_imputed_train)[, -1]
y_knn_imputed <- log(knn_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix_knn_imputed <- xgb.DMatrix(x_knn_imputed, label = y_knn_imputed)
xgb_model_knn_imputed <- xgboost(data = dmatrix_knn_imputed, nrounds = 100, objective = "reg:squarederror",
                                 params = list(max_depth = 4, eta = 0.1))
xgb_importance_knn_imputed_df <- xgb.importance(model = xgb_model_knn_imputed)
xgb_cv_knn_imputed <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix_knn_imputed, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb_importance_knn_imputed_df)
kable(xgb_importance_knn_imputed_df)
Feature | Gain | Cover | Frequency |
---|---|---|---|
QS2_6_MENTOREXPER | 0.4472706 | 0.4884201 | 0.4256536 |
QS1_23_YEARCOMPLE | 0.3109016 | 0.3084236 | 0.2810458 |
QS1_1_AGE | 0.1478528 | 0.1727227 | 0.1870915 |
QS3_4_LIFEEVENTS1_16_16_X1 | 0.0518309 | 0.0152118 | 0.0604575 |
QS2_3_PRESENCEOFM_Yes | 0.0261236 | 0.0025033 | 0.0228758 |
QS2_9_PRESENCEOFA_Yes | 0.0160206 | 0.0127185 | 0.0228758 |
7.5.2 XGB model 2:
7.5.2.1 All 3 SES indicators, mentor experiences, and age.
x <- model.matrix(~ QS3_4_LIFEEVENTS1_11_11_X1 + QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 +
                    QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + QS1_1_AGE,
                  data = mean_mode_imputed_train)[, -1]
y <- log(mean_mode_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix <- xgb.DMatrix(x, label = y)
xgb_model <- xgboost(data = dmatrix, nrounds = 100, objective = "reg:squarederror",
                     params = list(max_depth = 4, eta = 0.1))
xgb_importance_df <- xgb.importance(model = xgb_model)
xgb_cv <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb_importance_df)
write_csv(xgb_importance_df, "outputs/tables/week-04/02-xgb-importance.csv")
kable(xgb_importance_df)
Feature | Gain | Cover | Frequency |
---|---|---|---|
QS1_1_AGE | 0.3465507 | 0.3568841 | 0.3087607 |
QS3_4_LIFEEVENTS1_18_18_X1 | 0.1640453 | 0.0496456 | 0.1143162 |
QS2_6_MENTOREXPER | 0.1523684 | 0.3251911 | 0.2029915 |
QS3_4_LIFEEVENTS1_16_16_X1 | 0.1112911 | 0.0581188 | 0.0950855 |
QS2_9_PRESENCEOFA_Yes | 0.1045214 | 0.0778608 | 0.1100427 |
QS3_4_LIFEEVENTS1_11_11_X1 | 0.0607111 | 0.0737528 | 0.1036325 |
QS2_3_PRESENCEOFM_Yes | 0.0605121 | 0.0585468 | 0.0651709 |
(KNN imputed):
x_knn_imputed <- model.matrix(~ QS3_4_LIFEEVENTS1_11_11_X1 + QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 +
                                QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + QS1_1_AGE,
                              data = knn_imputed_train)[, -1]
y_knn_imputed <- log(knn_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix_knn_imputed <- xgb.DMatrix(x_knn_imputed, label = y_knn_imputed)
xgb_model_knn_imputed <- xgboost(data = dmatrix_knn_imputed, nrounds = 100, objective = "reg:squarederror",
                                 params = list(max_depth = 4, eta = 0.1))
xgb_importance_knn_imputed_df <- xgb.importance(model = xgb_model_knn_imputed)
xgb_cv_knn_imputed <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix_knn_imputed, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb_importance_knn_imputed_df)
kable(xgb_importance_knn_imputed_df)
Feature | Gain | Cover | Frequency |
---|---|---|---|
QS2_6_MENTOREXPER | 0.6497528 | 0.6784380 | 0.5716963 |
QS1_1_AGE | 0.1715591 | 0.2227154 | 0.2071228 |
QS3_4_LIFEEVENTS1_18_18_X1 | 0.0613338 | 0.0248077 | 0.0562324 |
QS3_4_LIFEEVENTS1_11_11_X1 | 0.0432909 | 0.0207085 | 0.0515464 |
QS3_4_LIFEEVENTS1_16_16_X1 | 0.0331486 | 0.0213620 | 0.0627929 |
QS2_9_PRESENCEOFA_Yes | 0.0324314 | 0.0284243 | 0.0393627 |
QS2_3_PRESENCEOFM_Yes | 0.0084833 | 0.0035442 | 0.0112465 |