6 Linear Regression and XGBoost

Week 4 (cont’d) - 02

Published

June 9, 2025

7 Linear model – Regression & Non-linear model – XGBoost

library(reticulate)
library(readr)
library(tidyr)
library(knitr)
library(arrow)
library(tidymodels)
library(regclass)
library(xgboost)
set.seed(512) # Set seed to ensure code result reproducibility for randomization

7.1 Data Import

# Data import: one-hot-encoded training splits produced by the preprocessing
# pipeline. Feather files are read with arrow; the FAISS-kNN-imputed split is
# a CSV. (Fix: the two read_feather() calls were previously duplicated,
# re-reading the same files twice.)
unimputed_train <- read_feather('../../dssg-2025-mentor-canada/Data/ohe_unimputed_train.feather')
mean_mode_imputed_train <- read_feather("../../dssg-2025-mentor-canada/Data/mean_mode_imputed_train.feather")
# Earlier (untuned) kNN imputation kept for reference:
# knn_imputed_train <- read_csv('../../dssg-2025-mentor-canada/Data/faiss_knn_imputed_dataset.csv')
knn_imputed_train <- read_csv('../../dssg-2025-mentor-canada/Data/faiss_tuned_knn_imputed_dataset.csv')

7.2 Model fitting

7.2.1 Linear model - Linear Regression (Ordinary Least Squares Regression)

# OLS regression on the mean/mode-imputed training split: income
# (QS1_28_EMPLOYMENT_calculated) regressed on three adverse-life-event
# indicators, presence of a mentor (QS2_3) and of a supportive adult (QS2_9),
# mentoring experience, and age.
model <- lm(QS1_28_EMPLOYMENT_calculated ~  QS3_4_LIFEEVENTS1_11_11_X1 + 
                                            QS3_4_LIFEEVENTS1_16_16_X1 + 
                                            QS3_4_LIFEEVENTS1_18_18_X1 + 
                                            QS2_3_PRESENCEOFM_Yes + 
                                            QS2_9_PRESENCEOFA_Yes + 
                                            QS2_6_MENTOREXPER + 
                                            QS1_1_AGE , 
            data = mean_mode_imputed_train)
summary(model)

Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_11_11_X1 + 
    QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 + 
    QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + 
    QS1_1_AGE, data = mean_mode_imputed_train)

Residuals:
     Min       1Q   Median       3Q      Max 
 -232749   -67484   -37164   -15642 14499427 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)   
(Intercept)                  194862     141557   1.377  0.16881   
QS3_4_LIFEEVENTS1_11_11_X1    -9546      36411  -0.262  0.79321   
QS3_4_LIFEEVENTS1_16_16_X1   101558      33091   3.069  0.00218 **
QS3_4_LIFEEVENTS1_18_18_X1    -1370      37757  -0.036  0.97106   
QS2_3_PRESENCEOFM_Yes         15939      32660   0.488  0.62558   
QS2_9_PRESENCEOFA_Yes        -11551      26758  -0.432  0.66603   
QS2_6_MENTOREXPER            -26100      28227  -0.925  0.35527   
QS1_1_AGE                     15246      11966   1.274  0.20279   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 521800 on 1907 degrees of freedom
Multiple R-squared:  0.007469,  Adjusted R-squared:  0.003826 
F-statistic:  2.05 on 7 and 1907 DF,  p-value: 0.04584
# Logistic regression (mean/mode-imputed split): odds of having a supportive
# adult present (QS2_9_PRESENCEOFA_Yes) given the life-event indicators,
# mentor presence, and age.
lg_model <- glm(QS2_9_PRESENCEOFA_Yes ~  QS3_4_LIFEEVENTS1_11_11_X1 + 
                                         QS3_4_LIFEEVENTS1_16_16_X1 + 
                                         QS3_4_LIFEEVENTS1_18_18_X1 + 
                                         QS2_3_PRESENCEOFM_Yes + 
                                         QS1_1_AGE, 
            data = mean_mode_imputed_train,
            family = binomial)
summary(lg_model)

Call:
glm(formula = QS2_9_PRESENCEOFA_Yes ~ QS3_4_LIFEEVENTS1_11_11_X1 + 
    QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 + 
    QS2_3_PRESENCEOFM_Yes + QS1_1_AGE, family = binomial, data = mean_mode_imputed_train)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -1.06455    0.07208 -14.770  < 2e-16 ***
QS3_4_LIFEEVENTS1_11_11_X1  0.14958    0.15533   0.963  0.33558    
QS3_4_LIFEEVENTS1_16_16_X1  0.37065    0.14003   2.647  0.00812 ** 
QS3_4_LIFEEVENTS1_18_18_X1 -0.30704    0.16228  -1.892  0.05849 .  
QS2_3_PRESENCEOFM_Yes       1.81698    0.10417  17.443  < 2e-16 ***
QS1_1_AGE                  -0.03502    0.05096  -0.687  0.49195    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2616.8  on 1914  degrees of freedom
Residual deviance: 2263.5  on 1909  degrees of freedom
AIC: 2275.5

Number of Fisher Scoring iterations: 4
  • Low R-squared suggests that the predictors, as specified, have limited practical/explanatory power for income.

7.2.2 Running linear regression KNN imputed data:

# Same OLS specification as `model` above, refit on the FAISS-kNN-imputed
# training split so the two imputation strategies can be compared.
model_knn_imputed <- lm(QS1_28_EMPLOYMENT_calculated ~  QS3_4_LIFEEVENTS1_11_11_X1 + 
                                            QS3_4_LIFEEVENTS1_16_16_X1 + 
                                            QS3_4_LIFEEVENTS1_18_18_X1 + 
                                            QS2_3_PRESENCEOFM_Yes + 
                                            QS2_9_PRESENCEOFA_Yes + 
                                            QS2_6_MENTOREXPER + 
                                            QS1_1_AGE , 
            data = knn_imputed_train)
summary(model_knn_imputed)

Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS3_4_LIFEEVENTS1_11_11_X1 + 
    QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 + 
    QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + 
    QS1_1_AGE, data = knn_imputed_train)

Residuals:
     Min       1Q   Median       3Q      Max 
 -360130  -148233   -88360    77485 14415317 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)   
(Intercept)                  300527     131130   2.292  0.02202 * 
QS3_4_LIFEEVENTS1_11_11_X1    12438      37022   0.336  0.73694   
QS3_4_LIFEEVENTS1_16_16_X1    94411      33652   2.805  0.00508 **
QS3_4_LIFEEVENTS1_18_18_X1     9908      38405   0.258  0.79645   
QS2_3_PRESENCEOFM_Yes         20112      28883   0.696  0.48630   
QS2_9_PRESENCEOFA_Yes        -15747      27735  -0.568  0.57025   
QS2_6_MENTOREXPER            -29268      28554  -1.025  0.30549   
QS1_1_AGE                    -23854      12162  -1.961  0.04999 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 530600 on 1907 degrees of freedom
Multiple R-squared:  0.009167,  Adjusted R-squared:  0.00553 
F-statistic:  2.52 on 7 and 1907 DF,  p-value: 0.01399
# Same logistic specification as `lg_model`, refit on the kNN-imputed split.
# NOTE(review): the printed coefficients below are byte-identical to the
# mean/mode model's output — verify that knn_imputed_train actually differs
# from mean_mode_imputed_train on these columns, or that the output was not
# pasted from the wrong run.
lg_model_knn_imputed <- glm(QS2_9_PRESENCEOFA_Yes ~  QS3_4_LIFEEVENTS1_11_11_X1 + 
                                         QS3_4_LIFEEVENTS1_16_16_X1 + 
                                         QS3_4_LIFEEVENTS1_18_18_X1 + 
                                         QS2_3_PRESENCEOFM_Yes + 
                                         QS1_1_AGE, 
            data = knn_imputed_train,
            family = binomial)
summary(lg_model_knn_imputed)

Call:
glm(formula = QS2_9_PRESENCEOFA_Yes ~ QS3_4_LIFEEVENTS1_11_11_X1 + 
    QS3_4_LIFEEVENTS1_16_16_X1 + QS3_4_LIFEEVENTS1_18_18_X1 + 
    QS2_3_PRESENCEOFM_Yes + QS1_1_AGE, family = binomial, data = knn_imputed_train)

Coefficients:
                           Estimate Std. Error z value Pr(>|z|)    
(Intercept)                -1.06455    0.07208 -14.770  < 2e-16 ***
QS3_4_LIFEEVENTS1_11_11_X1  0.14958    0.15533   0.963  0.33558    
QS3_4_LIFEEVENTS1_16_16_X1  0.37065    0.14003   2.647  0.00812 ** 
QS3_4_LIFEEVENTS1_18_18_X1 -0.30704    0.16228  -1.892  0.05849 .  
QS2_3_PRESENCEOFM_Yes       1.81698    0.10417  17.443  < 2e-16 ***
QS1_1_AGE                  -0.03502    0.05096  -0.687  0.49195    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2616.8  on 1914  degrees of freedom
Residual deviance: 2263.5  on 1909  degrees of freedom
AIC: 2275.5

Number of Fisher Scoring iterations: 4

7.2.2.1 Multicollinearity:

# Variance inflation factors (regclass::VIF) for the mean/mode OLS model;
# all values printed below are well under the common cutoff of 5.
VIF(model)
QS3_4_LIFEEVENTS1_11_11_X1 QS3_4_LIFEEVENTS1_16_16_X1 
                  1.297750                   1.165585 
QS3_4_LIFEEVENTS1_18_18_X1      QS2_3_PRESENCEOFM_Yes 
                  1.321109                   1.779627 
     QS2_9_PRESENCEOFA_Yes          QS2_6_MENTOREXPER 
                  1.234112                   1.504845 
                 QS1_1_AGE 
                  1.006562 

7.3 Mentor Influence on Annual Income

# OLS (mean/mode-imputed split): income regressed on the nine mentor-influence
# items (QS2_36_INFLUENCE1_1..9) plus age.
model_mentors_influence <- lm(QS1_28_EMPLOYMENT_calculated ~  QS2_36_INFLUENCE1_1_1 + 
                                                              QS2_36_INFLUENCE1_2_2 + 
                                                              QS2_36_INFLUENCE1_3_3 + 
                                                              QS2_36_INFLUENCE1_4_4 + 
                                                              QS2_36_INFLUENCE1_5_5 + 
                                                              QS2_36_INFLUENCE1_6_6 + 
                                                              QS2_36_INFLUENCE1_7_7 + 
                                                              QS2_36_INFLUENCE1_8_8 + 
                                                              QS2_36_INFLUENCE1_9_9 + QS1_1_AGE, 

                                data = mean_mode_imputed_train)
summary(model_mentors_influence)

Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS2_36_INFLUENCE1_1_1 + 
    QS2_36_INFLUENCE1_2_2 + QS2_36_INFLUENCE1_3_3 + QS2_36_INFLUENCE1_4_4 + 
    QS2_36_INFLUENCE1_5_5 + QS2_36_INFLUENCE1_6_6 + QS2_36_INFLUENCE1_7_7 + 
    QS2_36_INFLUENCE1_8_8 + QS2_36_INFLUENCE1_9_9 + QS1_1_AGE, 
    data = mean_mode_imputed_train)

Residuals:
     Min       1Q   Median       3Q      Max 
 -201802   -73920   -53003   -30317 14572589 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)
(Intercept)            -2930.3    81687.8  -0.036    0.971
QS2_36_INFLUENCE1_1_1   5497.6    20300.2   0.271    0.787
QS2_36_INFLUENCE1_2_2  28136.6    18942.8   1.485    0.138
QS2_36_INFLUENCE1_3_3  14195.6    20573.8   0.690    0.490
QS2_36_INFLUENCE1_4_4 -18035.5    20577.5  -0.876    0.381
QS2_36_INFLUENCE1_5_5 -29424.9    25155.4  -1.170    0.242
QS2_36_INFLUENCE1_6_6  -9623.3    25983.7  -0.370    0.711
QS2_36_INFLUENCE1_7_7   -175.4    24750.1  -0.007    0.994
QS2_36_INFLUENCE1_8_8  22186.0    24978.3   0.888    0.375
QS2_36_INFLUENCE1_9_9  21026.0    21434.9   0.981    0.327
QS1_1_AGE              14898.5    12013.4   1.240    0.215

Residual standard error: 523000 on 1904 degrees of freedom
Multiple R-squared:  0.00439,   Adjusted R-squared:  -0.0008392 
F-statistic: 0.8395 on 10 and 1904 DF,  p-value: 0.5904
# Same mentor-influence OLS specification, refit on the kNN-imputed split.
model_mentors_influence_knn_imputed <- lm(QS1_28_EMPLOYMENT_calculated ~  QS2_36_INFLUENCE1_1_1 + 
                                                              QS2_36_INFLUENCE1_2_2 + 
                                                              QS2_36_INFLUENCE1_3_3 + 
                                                              QS2_36_INFLUENCE1_4_4 + 
                                                              QS2_36_INFLUENCE1_5_5 + 
                                                              QS2_36_INFLUENCE1_6_6 + 
                                                              QS2_36_INFLUENCE1_7_7 + 
                                                              QS2_36_INFLUENCE1_8_8 + 
                                                              QS2_36_INFLUENCE1_9_9 + QS1_1_AGE, 

                                data = knn_imputed_train)
summary(model_mentors_influence_knn_imputed)

Call:
lm(formula = QS1_28_EMPLOYMENT_calculated ~ QS2_36_INFLUENCE1_1_1 + 
    QS2_36_INFLUENCE1_2_2 + QS2_36_INFLUENCE1_3_3 + QS2_36_INFLUENCE1_4_4 + 
    QS2_36_INFLUENCE1_5_5 + QS2_36_INFLUENCE1_6_6 + QS2_36_INFLUENCE1_7_7 + 
    QS2_36_INFLUENCE1_8_8 + QS2_36_INFLUENCE1_9_9 + QS1_1_AGE, 
    data = knn_imputed_train)

Residuals:
     Min       1Q   Median       3Q      Max 
 -335602  -155671   -95507    71537 14458352 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)  
(Intercept)            54826.2    83700.4   0.655   0.5125  
QS2_36_INFLUENCE1_1_1  21693.6    22329.7   0.972   0.3314  
QS2_36_INFLUENCE1_2_2  10472.9    20762.3   0.504   0.6140  
QS2_36_INFLUENCE1_3_3  12448.6    22999.1   0.541   0.5884  
QS2_36_INFLUENCE1_4_4   5443.6    23194.6   0.235   0.8145  
QS2_36_INFLUENCE1_5_5 -41326.6    25327.1  -1.632   0.1029  
QS2_36_INFLUENCE1_6_6 -11046.9    27091.7  -0.408   0.6835  
QS2_36_INFLUENCE1_7_7   -505.5    27328.5  -0.018   0.9852  
QS2_36_INFLUENCE1_8_8  21914.8    25380.3   0.863   0.3880  
QS2_36_INFLUENCE1_9_9  28699.2    23535.2   1.219   0.2228  
QS1_1_AGE             -23661.1    12208.2  -1.938   0.0528 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 531600 on 1904 degrees of freedom
Multiple R-squared:  0.007003,  Adjusted R-squared:  0.001787 
F-statistic: 1.343 on 10 and 1904 DF,  p-value: 0.2017

7.4 Model specification (Lasso regression with L1 regularization with mixture = 1)

(Workflow with tidymodels — code currently hidden with echo: false)

7.4.0.1 Define cross-validation (CV) folds for tuning & grid:

7.4.1 Tidymodels Workflow

7.4.2 Tuning Regularization Penalty

7.5 XGB

7.5.1 XGB model 1:

7.5.1.1 All 3 SES indicators, mentor experiences, age, and year of higher education completion.

(Mean mode imputed):

# XGBoost model 1 (mean/mode-imputed split): predict log(income + 1) from one
# SES indicator, mentor/adult presence, mentoring experience, age, and year
# of higher-education completion.
# model.matrix builds the design matrix; [, -1] drops the intercept column.
x <- model.matrix(~ QS3_4_LIFEEVENTS1_16_16_X1 + QS2_3_PRESENCEOFM_Yes + 
                  QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + 
                  QS1_1_AGE + QS1_23_YEARCOMPLE, 
                  data = mean_mode_imputed_train)[, -1]
# log(y + 1) tames the heavy right tail visible in the OLS residuals above.
y <- log(mean_mode_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix <- xgb.DMatrix(x, label = y)
xgb_model <- xgboost(data = dmatrix, nrounds = 100, objective = "reg:squarederror", 
                     params = list(max_depth = 4, eta = 0.1))
xgb.importance(model = xgb_model)

xgb.plot.importance(xgb.importance(model = xgb_model))
xgb_importance_df <- xgb.importance(model = xgb_model)

# 5-fold CV over 200 rounds with the same hyperparameters.
xgb_cv <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb.importance(model = xgb_model))

kable(xgb_importance_df)
Feature Gain Cover Frequency
QS1_23_YEARCOMPLE 0.3208173 0.5076989 0.3370697
QS1_1_AGE 0.2958916 0.2551412 0.2754203
QS2_9_PRESENCEOFA_Yes 0.1522012 0.0451389 0.0960769
QS2_6_MENTOREXPER 0.0904326 0.1172270 0.1337070
QS3_4_LIFEEVENTS1_16_16_X1 0.0827873 0.0463520 0.0848679
QS2_3_PRESENCEOFM_Yes 0.0578700 0.0284421 0.0728583

(KNN imputed):

# XGBoost model 1 refit on the kNN-imputed split (same features and
# hyperparameters as the mean/mode version above, for comparison).
x_knn_imputed <- model.matrix(~ QS3_4_LIFEEVENTS1_16_16_X1 + QS2_3_PRESENCEOFM_Yes + 
                  QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + 
                  QS1_1_AGE + QS1_23_YEARCOMPLE, 
                  data = knn_imputed_train)[, -1]
# log(y + 1) transform, matching the mean/mode model.
y_knn_imputed <- log(knn_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix_knn_imputed <- xgb.DMatrix(x_knn_imputed, label = y_knn_imputed)
xgb_model_knn_imputed <- xgboost(data = dmatrix_knn_imputed, nrounds = 100, objective = "reg:squarederror", 
                     params = list(max_depth = 4, eta = 0.1))
xgb.importance(model = xgb_model_knn_imputed)

xgb.plot.importance(xgb.importance(model = xgb_model_knn_imputed))
xgb_importance_knn_imputed_df <- xgb.importance(model = xgb_model_knn_imputed)

# FIX: xgb.cv previously ran on `dmatrix` (the mean/mode DMatrix), so the CV
# scores reported here did not correspond to the kNN-imputed data.
xgb_cv_knn_imputed <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix_knn_imputed, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb.importance(model = xgb_model_knn_imputed))

kable(xgb_importance_knn_imputed_df)
Feature Gain Cover Frequency
QS2_6_MENTOREXPER 0.4472706 0.4884201 0.4256536
QS1_23_YEARCOMPLE 0.3109016 0.3084236 0.2810458
QS1_1_AGE 0.1478528 0.1727227 0.1870915
QS3_4_LIFEEVENTS1_16_16_X1 0.0518309 0.0152118 0.0604575
QS2_3_PRESENCEOFM_Yes 0.0261236 0.0025033 0.0228758
QS2_9_PRESENCEOFA_Yes 0.0160206 0.0127185 0.0228758

7.5.2 XGB model 2:

7.5.2.1 All 3 SES indicators, mentor experiences, and age.

# XGBoost model 2 (mean/mode-imputed split): same target as model 1 but with
# all three life-event SES indicators, mentor/adult presence, mentoring
# experience, and age (no year-of-completion feature).
x <- model.matrix(~ QS3_4_LIFEEVENTS1_11_11_X1 + QS3_4_LIFEEVENTS1_16_16_X1 +  QS3_4_LIFEEVENTS1_18_18_X1 + QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + QS1_1_AGE, data = mean_mode_imputed_train)[, -1]

# log(y + 1) transform of income, as in model 1.
y <- log(mean_mode_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix <- xgb.DMatrix(x, label = y)
xgb_model <- xgboost(data = dmatrix, nrounds = 100, objective = "reg:squarederror", 
params = list(max_depth = 4, eta = 0.1))

xgb_importance_df <- xgb.importance(model = xgb_model)

# 5-fold CV over 200 rounds with the same hyperparameters.
xgb_cv <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb.importance(model = xgb_model))

# Persist the importance table for the report.
write_csv(xgb_importance_df, "outputs/tables/week-04/02-xgb-importance.csv")
kable(xgb_importance_df)
Feature Gain Cover Frequency
QS1_1_AGE 0.3465507 0.3568841 0.3087607
QS3_4_LIFEEVENTS1_18_18_X1 0.1640453 0.0496456 0.1143162
QS2_6_MENTOREXPER 0.1523684 0.3251911 0.2029915
QS3_4_LIFEEVENTS1_16_16_X1 0.1112911 0.0581188 0.0950855
QS2_9_PRESENCEOFA_Yes 0.1045214 0.0778608 0.1100427
QS3_4_LIFEEVENTS1_11_11_X1 0.0607111 0.0737528 0.1036325
QS2_3_PRESENCEOFM_Yes 0.0605121 0.0585468 0.0651709

(KNN imputed):

# XGBoost model 2 refit on the kNN-imputed split (same features and
# hyperparameters as the mean/mode version, for comparison).
x_knn_imputed <- model.matrix(~ QS3_4_LIFEEVENTS1_11_11_X1 + QS3_4_LIFEEVENTS1_16_16_X1 +  QS3_4_LIFEEVENTS1_18_18_X1 + QS2_3_PRESENCEOFM_Yes + QS2_9_PRESENCEOFA_Yes + QS2_6_MENTOREXPER + QS1_1_AGE, 
                  data = knn_imputed_train)[, -1]
# log(y + 1) transform of income, as elsewhere in this section.
y_knn_imputed <- log(knn_imputed_train$QS1_28_EMPLOYMENT_calculated + 1)
dmatrix_knn_imputed <- xgb.DMatrix(x_knn_imputed, label = y_knn_imputed)
xgb_model_knn_imputed <- xgboost(data = dmatrix_knn_imputed, nrounds = 100, objective = "reg:squarederror", 
                     params = list(max_depth = 4, eta = 0.1))
xgb.importance(model = xgb_model_knn_imputed)

xgb.plot.importance(xgb.importance(model = xgb_model_knn_imputed))
xgb_importance_knn_imputed_df <- xgb.importance(model = xgb_model_knn_imputed)

# FIX: xgb.cv previously ran on `dmatrix` (the mean/mode DMatrix), so the CV
# scores reported here did not correspond to the kNN-imputed data.
xgb_cv_knn_imputed <- xgb.cv(params = list(max_depth = 4, eta = 0.1), data = dmatrix_knn_imputed, nrounds = 200, nfold = 5)
xgb.plot.importance(xgb.importance(model = xgb_model_knn_imputed))

kable(xgb_importance_knn_imputed_df)
Feature Gain Cover Frequency
QS2_6_MENTOREXPER 0.6497528 0.6784380 0.5716963
QS1_1_AGE 0.1715591 0.2227154 0.2071228
QS3_4_LIFEEVENTS1_18_18_X1 0.0613338 0.0248077 0.0562324
QS3_4_LIFEEVENTS1_11_11_X1 0.0432909 0.0207085 0.0515464
QS3_4_LIFEEVENTS1_16_16_X1 0.0331486 0.0213620 0.0627929
QS2_9_PRESENCEOFA_Yes 0.0324314 0.0284243 0.0393627
QS2_3_PRESENCEOFM_Yes 0.0084833 0.0035442 0.0112465