Predicting Life Expectancy¶
In this notebook we will tackle the well-known Life Expectancy dataset. The data can be found here on Kaggle. Our goal is to see how well we can predict Life expectancy for an unknown location based on health and economic factors. We will then try to identify which factors matter most for this task. Identifying them could, for example, inform future public policies in places that wish to improve Life expectancy.
A - Data exploration and Preprocessing¶
First, let's load the necessary libraries:
import numpy as np
np.random.seed(62)
import pandas as pd
import random as rd
rd.seed(62)
import matplotlib.pyplot as plt
import seaborn as sns
We'll now load the data:
df = pd.read_csv('Life_Expectancy_Data.csv')
datasets = {'raw':df}
features = list(df.columns)
print(f'Shape : {df.shape}')
print(f"\nFeatures : {features}")
print("\nHere's a small description of the target variable (Life Expectancy)")
TARGET = 'Life expectancy '
print(df[TARGET].describe())
Shape : (2938, 22)

Features : ['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness 1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']

Here's a small description of the target variable (Life Expectancy)
count    2928.000000
mean       69.224932
std         9.523867
min        36.300000
25%        63.100000
50%        72.100000
75%        75.700000
max        89.000000
Name: Life expectancy , dtype: float64
We will not be using Country, because the goal is to predict, for an unknown country, the life expectancy of its inhabitants based on descriptors of health and economy in that country. The Year variable is also not useful: it is neither a health- nor an economy-related feature and is only present in the dataset because of the data collection protocol. There is, however, a correlation for a given country between Year and Life expectancy, as shown below, but given the task we set for this study it will not be considered.
country_mask = np.where(df['Country'] == 'Bangladesh', True, False)
sns.pointplot(data=df.iloc[country_mask,:], x='Year', y=TARGET)
plt.xticks(rotation=90)
plt.show()
features.remove('Country')
features.remove('Year')
df_2 = df[features]
datasets.update({'raw_no_Country_Year':df_2})
Let's visualize the data:
def plot_distribution(df, col):
plt.figure()
plt.hist(df[col])
plt.title(f'Distribution of {col}')
plt.xlabel(col)
plt.ylabel('Number of observations')
plt.show()
df = datasets['raw_no_Country_Year']
fig, axes = plt.subplots(5,4, figsize=(12,24), sharey=False)
for i, col in enumerate(list(df.columns)):
row = i // 4
col_index = i % 4
sns.histplot(df[col], kde=False, ax=axes[row, col_index])
axes[row, col_index].set_title(f'Distribution of {col}')
axes[row, col_index].set_xlabel(col)
if col_index == 0:
axes[row, col_index].set_ylabel('Count')
else:
axes[row, col_index].set_ylabel('')
plt.tight_layout()
plt.show()
def look_NAs(df, col):
NAs = np.where(pd.isna(df[col]), 1, 0).sum()
percentage = np.where(pd.isna(df[col]), 1, 0).mean() * 100
print(f'\nThere are {NAs} NAs in {col}.')
print(f'This represents roughly {percentage} % of missing values.')
if NAs == 0:
return False
return True
cols_with_NAs = []
cols_no_NAs = []
for col in features:
if look_NAs(df, col):
cols_with_NAs.append(col)
else:
cols_no_NAs.append(col)
There are 0 NAs in Status. This represents roughly 0.0 % of missing values.
There are 10 NAs in Life expectancy . This represents roughly 0.3403675970047651 % of missing values.
There are 10 NAs in Adult Mortality. This represents roughly 0.3403675970047651 % of missing values.
There are 0 NAs in infant deaths. This represents roughly 0.0 % of missing values.
There are 194 NAs in Alcohol. This represents roughly 6.603131381892443 % of missing values.
There are 0 NAs in percentage expenditure. This represents roughly 0.0 % of missing values.
There are 553 NAs in Hepatitis B. This represents roughly 18.82232811436351 % of missing values.
There are 0 NAs in Measles . This represents roughly 0.0 % of missing values.
There are 34 NAs in  BMI . This represents roughly 1.1572498298162015 % of missing values.
There are 0 NAs in under-five deaths . This represents roughly 0.0 % of missing values.
There are 19 NAs in Polio. This represents roughly 0.6466984343090538 % of missing values.
There are 226 NAs in Total expenditure. This represents roughly 7.6923076923076925 % of missing values.
There are 19 NAs in Diphtheria . This represents roughly 0.6466984343090538 % of missing values.
There are 0 NAs in HIV/AIDS. This represents roughly 0.0 % of missing values.
There are 448 NAs in GDP. This represents roughly 15.248468345813478 % of missing values.
There are 652 NAs in Population. This represents roughly 22.19196732471069 % of missing values.
There are 34 NAs in thinness 1-19 years. This represents roughly 1.1572498298162015 % of missing values.
There are 34 NAs in thinness 5-9 years. This represents roughly 1.1572498298162015 % of missing values.
There are 167 NAs in Income composition of resources. This represents roughly 5.684138869979578 % of missing values.
There are 163 NAs in Schooling. This represents roughly 5.547991831177672 % of missing values.
The following columns do not have any missing values: Status, infant deaths, percentage expenditure, Measles, under-five deaths and HIV/AIDS.
To handle missing values, we will consider three strategies: mean, median, or KNN imputation.
from sklearn.impute import SimpleImputer, KNNImputer
mean_imp = SimpleImputer(strategy="mean")
med_imp = SimpleImputer(strategy="median")
knn_imp = KNNImputer(weights="distance") # close neighbors have greater influence on imputation value than further neighbors.
mean_imp_features = pd.DataFrame(mean_imp.fit_transform(datasets['raw_no_Country_Year'][cols_with_NAs]), columns=cols_with_NAs)
mean_df = pd.concat([datasets['raw_no_Country_Year'][cols_no_NAs], mean_imp_features], axis=1)
print(f'\nThere are {np.where(pd.isna(mean_df), 1, 0).sum()} NAs in this dataframe')
datasets.update({'mean_imputed':mean_df})
med_imp_features = pd.DataFrame(med_imp.fit_transform(datasets['raw_no_Country_Year'][cols_with_NAs]), columns=cols_with_NAs)
med_df = pd.concat([datasets['raw_no_Country_Year'][cols_no_NAs], med_imp_features], axis=1)
print(f'\nThere are {np.where(pd.isna(med_df), 1, 0).sum()} NAs in this dataframe')
datasets.update({'median_imputed':med_df})
knn_imp_features = pd.DataFrame(knn_imp.fit_transform(datasets['raw_no_Country_Year'][cols_with_NAs]), columns=cols_with_NAs)
knn_df = pd.concat([datasets['raw_no_Country_Year'][cols_no_NAs], knn_imp_features], axis=1)
print(f'\nThere are {np.where(pd.isna(knn_df), 1, 0).sum()} NAs in this dataframe')
datasets.update({'knn_imputed':knn_df})
There are 0 NAs in this dataframe
There are 0 NAs in this dataframe
There are 0 NAs in this dataframe
Now that we have datasets with no missing values, let's normalize our data. We will simply make sure every feature is on the same scale using MinMaxScaler. Then we will one-hot encode the Status feature, the only remaining categorical feature.
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
scaler = MinMaxScaler()
enc = OneHotEncoder(sparse_output=False)
def preprocess_data(df, cat, target):
numerical = [col for col in df.columns if col not in cat and col != target]
scaled_df = pd.DataFrame(scaler.fit_transform(df[numerical]), columns=numerical)
one_hot_df = pd.DataFrame(enc.fit_transform(df[cat]), columns=list(np.unique(df[cat])))
return pd.concat([scaled_df, one_hot_df, df[target]], axis=1)
for imputed_df in ['mean_imputed', 'median_imputed', 'knn_imputed']:
df = datasets[imputed_df]
df = preprocess_data(df, ['Status'], TARGET)
datasets[imputed_df] = df
We have handled missing values, adjusted scales and transformed the categorical feature: we are now ready to start training ML models.
B - Models¶
Before training models, we need to define our training strategy. Since we have a reasonable amount of data, we will use 5-fold cross-validation to evaluate and compare models. This cross-validation will be done on a subset of our data.
More concretely, we will do a simple train/test split, using 80% of the data for the training set and 20% for the test set. Cross-validation will be performed on the training set.
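For reference, the same protocol could be set up with scikit-learn utilities. The snippet below is only a sketch of that equivalent setup; the notebook keeps its own make_sets helper so that the seed set at the top controls the split.
from sklearn.model_selection import train_test_split, KFold

# Equivalent split with scikit-learn (illustration only, not used below).
all_rows = np.arange(datasets['mean_imputed'].shape[0])
sk_train, sk_test = train_test_split(all_rows, test_size=0.2, random_state=62)
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=62)  # could be passed as cv= to cross_val_score
print(len(sk_train), len(sk_test))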
def make_sets(df_size, test_size=0.2):
train_size = int(np.ceil(df_size * (1 - test_size)))
train_set = rd.sample([i for i in range(df_size)], train_size)
test_set = [i for i in range(df_size) if i not in train_set]
rd.shuffle(test_set)
return train_set, test_set
def get_data(df, row_set):
return df.iloc[row_set,:]
def get_train_test(df, train, test, target):
columns = list(df.columns)
columns.remove(target)
return get_data(df, train)[columns], get_data(df, train)[target], get_data(df, test)[columns], get_data(df, test)[target]
train_set, test_set = make_sets(datasets['mean_imputed'].shape[0], test_size=0.2)
print(f'\nThere are {len(train_set)} observations in the training set.')
print(f'\nThere are {len(test_set)} observations in the test set.')
There are 2351 observations in the training set.
There are 587 observations in the test set.
The baseline model for this task is LinearRegression. We will compare it with Ridge, Lasso, ElasticNet, a linear SVR, a Gaussian SVR, RandomForestRegressor and CatBoostRegressor. This will be done on the datasets produced by all three imputation techniques.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
def cv_on_datasets(estimator, train_set, test_set, target, cv=5):
dico = {}
for imputed_df in ['mean_imputed', 'median_imputed', 'knn_imputed']:
X_train, Y_train, X_test, Y_test = get_train_test(datasets[imputed_df], train_set, test_set, target)
scores = cross_val_score(estimator, X_train, Y_train, cv=cv, scoring='neg_mean_absolute_percentage_error')
perf = -scores.mean() * 100 # To make it a percentage
dico.update({imputed_df:perf, 'std_'+imputed_df:np.abs(np.std(-np.array(scores)*100))})
return dico
baseline_model = LinearRegression(n_jobs=-1)
baseline_perfs = cv_on_datasets(baseline_model, train_set, test_set, TARGET)
print(baseline_perfs)
results = {'Model':['LinearRegression'],
'mean_imputed':[baseline_perfs['mean_imputed']],
'std_mean_imputed':[baseline_perfs['std_mean_imputed']],
'median_imputed':[baseline_perfs['median_imputed']],
'std_median_imputed':[baseline_perfs['std_median_imputed']],
'knn_imputed':[baseline_perfs['knn_imputed']],
'std_knn_imputed':[baseline_perfs['std_knn_imputed']]}
{'mean_imputed': 4.6810660441156235, 'std_mean_imputed': 0.15182584715280947, 'median_imputed': 4.668883489956363, 'std_median_imputed': 0.1574078133628543, 'knn_imputed': 4.4674953856839075, 'std_knn_imputed': 0.11952634376293078}
The baseline model already seems to perform really well, with only about a 4 % average error on predicted Life expectancy across all folds. The best imputation technique seems to be the knn_imputed one.
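As a reminder, the MAPE reported here is the mean of the per-observation relative errors, expressed as a percentage. Here is a tiny worked example with made-up values, just to make the metric concrete.
from sklearn.metrics import mean_absolute_percentage_error

# MAPE = mean(|y - yhat| / |y|) * 100, illustrated on three made-up observations.
y_true = np.array([70.0, 60.0, 80.0])
y_pred = np.array([68.0, 63.0, 80.0])
manual = np.mean(np.abs(y_true - y_pred) / np.abs(y_true)) * 100
print(manual, mean_absolute_percentage_error(y_true, y_pred) * 100)  # both ~2.62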
Let's compare these baseline performances to other models.
to_compare = {'Ridge':Ridge(),
'Lasso':Lasso(),
'ElasticNet':ElasticNet(),
'SVR(linear)':SVR(kernel='linear'),
'SVR(gaussian)':SVR(kernel='rbf'),
'RandomForestRegressor':RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
'CatBoostRegressor':CatBoostRegressor(n_estimators=100, random_state=42, verbose=0, thread_count=-1)}
def compare_models(models_to_compare, train_set, test_set, target, result_dict, cv=5):
for model_name in models_to_compare.keys():
print(model_name)
model = models_to_compare[model_name]
mod_perfs = cv_on_datasets(model, train_set, test_set, target, cv=cv)
result_dict['Model'].append(model_name)
result_dict['mean_imputed'].append(mod_perfs['mean_imputed'])
result_dict['std_mean_imputed'].append(mod_perfs['std_mean_imputed'])
result_dict['median_imputed'].append(mod_perfs['median_imputed'])
result_dict['std_median_imputed'].append(mod_perfs['std_median_imputed'])
result_dict['knn_imputed'].append(mod_perfs['knn_imputed'])
result_dict['std_knn_imputed'].append(mod_perfs['std_knn_imputed'])
df = pd.DataFrame(result_dict)
return df
def plot_performances(result_df):
fig, axes = plt.subplots(3,1, figsize=(12,12), sharex=True)
for i,method in enumerate(['mean_imputed', 'median_imputed', 'knn_imputed']):
barplot = sns.barplot(data=result_df, y='Model', x=method, hue='Model',
legend=False, palette='viridis', ax=axes[i], orient='h')
for container in barplot.containers:
barplot.bar_label(container, fmt='%.2f', label_type='center')
# Manually add error bars
for j, (x, err) in enumerate(zip(result_df[method], result_df['std_'+method])):
barplot.errorbar(x=x, y=j, xerr=err, fmt='none', ecolor='black', capsize=3, capthick=1)
axes[i].set_title(f'Model performances on {method} dataset (lower is better)')
axes[i].set_ylabel('Models')
axes[i].set_xlabel('MAPE')
plt.tight_layout()
plt.show()
results_df = compare_models(to_compare, train_set, test_set, TARGET, results, cv=5)
Ridge
Lasso
ElasticNet
SVR(linear)
SVR(gaussian)
RandomForestRegressor
CatBoostRegressor
plot_performances(results_df)
We can see that the best performance is obtained with the RandomForestRegressor on the mean_imputed dataset. Performance is already really good, with less than 2 % average error on the predicted values!
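To read the best configuration off results_df programmatically, here is a quick check (nothing new, it only summarizes the table built above).
# For each imputation strategy, report the model with the lowest cross-validated MAPE.
for method in ['mean_imputed', 'median_imputed', 'knn_imputed']:
    best_row = results_df.loc[results_df[method].idxmin()]
    print(f"{method}: {best_row['Model']} ({best_row[method]:.2f} % MAPE)")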
C - Optimization¶
Now that we have found the best algorithm for our regression task, we can try to optimize it. We will first look for the minimal subset of features that gives the best performance, then check whether engineered features can further improve the model's predictions. Finally, we will tune its hyperparameters.
For the first step, let's look at the correlations between each feature and Life expectancy. The goal is to build intuition about which features seem useful.
df = datasets['mean_imputed']
for i in range(7):
sns.pairplot(df, x_vars=df.columns[3*i:3*(i+1)],y_vars=[TARGET])
plt.show()
correlations = df.corr()[TARGET].drop(TARGET).sort_values()
plt.figure(figsize=(10, 6))
sns.barplot(x=correlations.values, y=correlations.index, hue=correlations.index, legend=False, palette='coolwarm')
plt.title(f'Correlation of each feature with {TARGET}')
plt.xlabel('Correlation coefficient')
plt.ylabel('Features')
plt.show()
From these plots, one could argue that under-five deaths, infant deaths, Measles, Population, Hepatitis B and Total expenditure should be removed from the features, since they are the least correlated with the target. However, apart from Population, whose correlation with Life expectancy is close to 0, all of these features could still be of value to the model. To truly identify which ones to remove, we will use a forward (ascending) feature selection algorithm.
def select_features(estimator, training_data, trainset, testset, target, cv=5, scoring='neg_mean_absolute_percentage_error', min_improvement=1e-4, warm_start=([], 1e5)):
X_train, Y_train, _, _ = get_train_test(training_data, trainset, testset, target)
columns = X_train.columns
perf_history = []
old_perf = +np.inf
perf = warm_start[1]
n_features = 1
kept = warm_start[0]
best_comb = list(kept)  # fallback: if no candidate combination improves, keep the warm-start combination
while perf <= old_perf - min_improvement:
print(f'\nBest performance so far : {perf}')
print(f'Best combination so far : {kept}')
# Create combinations of features
combinations = [kept + [col] for col in columns if col not in kept]
if combinations == []:
break
# For each combination we use cross-validation to evaluate the model
perf_list = []
for comb in combinations:
scores = cross_val_score(estimator, X_train[comb], Y_train, cv=cv, scoring=scoring, n_jobs=-1)
if 'neg' in scoring:
new_perf = -scores.mean()
if scoring == 'neg_mean_absolute_percentage_error':
new_perf = new_perf * 100
perf_list.append(new_perf)
perf_history.append((comb, new_perf))
# We take the best performance and update the performance threshold
old_perf = perf
perf = np.min(perf_list)
if perf < old_perf:
# We update the best combination of features accordingly
best_comb = combinations[np.argmin(perf_list)]
kept = best_comb
# We look for combinations of n+1 features
n_features += 1
return best_comb, perf_history
def plot_feature_selection(perf_history, best_comb=None, metric='Metric'):
if best_comb:
gradual = [best_comb[0:i+1] for i in range(len(best_comb) - 1)] + [best_comb]
dico = {'Features':[str(gradual[0])] + ['+ ' + str(comb[-1]) for comb in gradual[1:]],
metric: [t[1] for t in perf_history if t[0] in gradual]}
print(dico)
else:
dico = {'Features':[str(t[0]) for t in perf_history],
metric: [t[1] for t in perf_history]}
df = pd.DataFrame(dico)
plt.figure()
ax = sns.barplot(data=df, y='Features', x=metric, hue='Features', legend=False, palette='viridis', orient='h')
for bars in ax.containers:
ax.bar_label(bars)
ax.set_xlim((0,1.2*np.max(dico[metric])))
plt.show()
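As an optional cross-check (not used in the rest of the notebook), scikit-learn ships a SequentialFeatureSelector that implements the same forward-selection idea. The sketch below requests a fixed number of features up front, since it does not use the minimum-improvement stopping rule of the select_features helper above.
from sklearn.feature_selection import SequentialFeatureSelector

# Hedged sketch: forward selection with a fixed number of features to keep.
sfs_estimator = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
sfs = SequentialFeatureSelector(
    sfs_estimator,
    n_features_to_select=9,  # chosen up front for this sketch
    direction='forward',
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
    n_jobs=-1,
)
X_train_sfs, Y_train_sfs, _, _ = get_train_test(datasets['mean_imputed'], train_set, test_set, TARGET)
sfs.fit(X_train_sfs, Y_train_sfs)
print(list(X_train_sfs.columns[sfs.get_support()]))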
best_model = to_compare['RandomForestRegressor']
best_comb, perf_history = select_features(best_model, datasets['mean_imputed'], train_set, test_set, TARGET)
Best performance so far : 100000.0
Best combination so far : []

Best performance so far : 4.185676116439454
Best combination so far : ['Adult Mortality']

Best performance so far : 2.4876195262944667
Best combination so far : ['Adult Mortality', 'Income composition of resources']

Best performance so far : 2.0011636822189005
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years']

Best performance so far : 1.8434063994594623
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ']

Best performance so far : 1.791672165150825
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS']

Best performance so far : 1.7726989244402087
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing']

Best performance so far : 1.7583468669741678
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling']

Best performance so far : 1.7541577287333292
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths']

Best performance so far : 1.7505564009136694
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed']
print(f'\nSelected Features : {best_comb}')
df = datasets['mean_imputed']
datasets.update({'best_features':df[best_comb+[TARGET]]})
plot_feature_selection(perf_history, best_comb, metric='MAPE')
Selected Features : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed']

{'Features': ["['Adult Mortality']", '+ Income composition of resources', '+ thinness 5-9 years', '+ under-five deaths ', '+ HIV/AIDS', '+ Developing', '+ Schooling', '+ infant deaths', '+ Developed'], 'MAPE': [4.185676116439454, 2.4876195262944667, 2.0011636822189005, 1.8434063994594623, 1.791672165150825, 1.7726989244402087, 1.7583468669741678, 1.7541577287333292, 1.7505564009136694]}
This subset of features gives better performance than using all of them. We can see that with Adult Mortality alone, the model is already capable of making quite good predictions of Life expectancy! A reassuring note is that this subset is composed of a mix of health and economy factors, just as we expected.
We will now see whether adding interaction variables can further improve performance.
def add_interactions(df, target):
columns = list(df.columns)
if target in columns:
columns.remove(target)
int_dico = {}
memory = []
for col1 in columns:
for col2 in columns:
if col1 != col2:
name = col1 + '_x_' + col2
alt_name = col2 + '_x_' + col1
if name not in memory and alt_name not in memory:
interaction = df[col1] * df[col2]
memory.append(name)
int_dico.update({name:interaction})
int_df = pd.DataFrame(int_dico)
return pd.concat([df[columns], int_df, df[target]], axis=1)
with_interactions = add_interactions(df, TARGET)
datasets.update({'best_features_w_interactions':with_interactions})
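For reference, a similar set of pairwise products could also be generated with scikit-learn's PolynomialFeatures. This is only a sketch of an alternative to the add_interactions helper above, not what the rest of the notebook uses.
from sklearn.preprocessing import PolynomialFeatures

# Sketch: pairwise interaction terms only (no bias column, no squared terms),
# built on the same mean-imputed feature matrix used above.
base_df = datasets['mean_imputed']
X_cols = [c for c in base_df.columns if c != TARGET]
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_matrix = poly.fit_transform(base_df[X_cols])
poly_names = poly.get_feature_names_out(X_cols)
print(poly_matrix.shape, len(poly_names))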
We will start from our previously identified best combination.
best_comb_int, perf_history_int = select_features(best_model, datasets['best_features_w_interactions'],
train_set, test_set, TARGET,
warm_start=(best_comb, [t[1] for t in perf_history if t[0] == best_comb][0]))
Best performance so far : 1.7505564009136694
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed']

Best performance so far : 1.7313922504356534
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing']

Best performance so far : 1.7252295553490373
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing', ' thinness 5-9 years_x_Schooling']

Best performance so far : 1.7180323809984268
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing', ' thinness 5-9 years_x_Schooling', 'Schooling_x_Developed']

Best performance so far : 1.7163105652964157
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing', ' thinness 5-9 years_x_Schooling', 'Schooling_x_Developed', ' HIV/AIDS_x_Schooling']

Best performance so far : 1.7107973701816115
Best combination so far : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing', ' thinness 5-9 years_x_Schooling', 'Schooling_x_Developed', ' HIV/AIDS_x_Schooling', 'Adult Mortality_x_Developing']
print(f'\nSelected Features : {best_comb_int}')
datasets.update({'best_features_w_interactions':datasets['best_features_w_interactions'][best_comb_int+[TARGET]]})
Selected Features : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing', ' thinness 5-9 years_x_Schooling', 'Schooling_x_Developed', ' HIV/AIDS_x_Schooling', 'Adult Mortality_x_Developing']
print(f'\nThe best combination is : {best_comb_int}')
print(f'New best performance : {[t[1] for t in perf_history_int if t[0] == best_comb_int][0]}')
print(f'Difference with previous best : {[t[1] for t in perf_history_int if t[0] == best_comb_int][0] - [t[1] for t in perf_history if t[0] == best_comb][0]}')
The best combination is : ['Adult Mortality', 'Income composition of resources', ' thinness 5-9 years', 'under-five deaths ', ' HIV/AIDS', 'Developing', 'Schooling', 'infant deaths', 'Developed', 'Alcohol_x_Developing', ' thinness 5-9 years_x_Schooling', 'Schooling_x_Developed', ' HIV/AIDS_x_Schooling', 'Adult Mortality_x_Developing']
New best performance : 1.7107973701816115
Difference with previous best : -0.03975903073205789
import pickle as pkl
with open('all_datasets.pkl', 'wb') as f:
pkl.dump(datasets, f)
dico = {'best':best_comb, 'best_history':perf_history, 'best_w_interactions':best_comb_int, 'best_w_interactions_history':perf_history_int}
with open('feature_selection.pkl', 'wb') as f:
pkl.dump(dico, f)
with open('all_datasets.pkl', 'rb') as f:
datasets = pkl.load(f)
with open('feature_selection.pkl', 'rb') as f:
feature_selection_results = pkl.load(f)
best_comb, perf_history = feature_selection_results['best'], feature_selection_results['best_history']
best_comb_int, perf_history_int = feature_selection_results['best_w_interactions'], feature_selection_results['best_w_interactions_history']
def cross_val(estimator, df, train, test, target, cv=5, scoring='neg_mean_absolute_percentage_error'):
X_train, Y_train, _, _ = get_train_test(df, train, test, target)  # use the function arguments, not the global train/test sets
scores = cross_val_score(estimator, X_train, Y_train, cv=cv, scoring=scoring, n_jobs=-1)
perf = -scores.mean() * 100
return perf
perf_with_best_interactions = cross_val(best_model, datasets['best_features_w_interactions'], train_set, test_set, TARGET)
print(perf_with_best_interactions)
print(len(best_comb_int))
1.7107973701816117
14
In the end we went from 21 features down to 14, and the average prediction error went from 1.89 % to 1.71 %.
We will now see whether hyperparameter tuning can further improve performance.
import optuna
def objective(trial):
n_estimators = trial.suggest_int("n_estimators", 50, 1000)
max_depth = trial.suggest_int("max_depth", 4, 20)
criterion = trial.suggest_categorical("criterion", ['squared_error', 'absolute_error', 'friedman_mse','poisson'])
min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
model = RandomForestRegressor(
n_estimators=n_estimators,
max_depth=max_depth,
criterion=criterion,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
max_features=max_features,
n_jobs=-1,
random_state=42
)
perf = cross_val(model, datasets['best_features_w_interactions'], train_set, test_set, TARGET)
print(perf)
return perf
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
[I 2024-11-01 12:53:25,046] A new study created in memory with name: no-name-b3121759-1c70-43bf-bff1-618348cf521c
[I 2024-11-01 12:53:39,865] Trial 0 finished with value: 1.8692153963421752 and parameters: {'n_estimators': 445, 'max_depth': 11, 'criterion': 'poisson', 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 0 with value: 1.8692153963421752.
1.8692153963421752
[I 2024-11-01 12:53:44,310] Trial 1 finished with value: 2.1472938005501216 and parameters: {'n_estimators': 173, 'max_depth': 13, 'criterion': 'friedman_mse', 'min_samples_split': 9, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 1.8692153963421752.
2.1472938005501216
[I 2024-11-01 12:53:49,240] Trial 2 finished with value: 2.4273857598729514 and parameters: {'n_estimators': 471, 'max_depth': 7, 'criterion': 'squared_error', 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 0 with value: 1.8692153963421752.
2.4273857598729514
[I 2024-11-01 12:54:06,150] Trial 3 finished with value: 2.459105263400133 and parameters: {'n_estimators': 237, 'max_depth': 17, 'criterion': 'absolute_error', 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 1.8692153963421752.
2.459105263400133
[I 2024-11-01 12:54:17,963] Trial 4 finished with value: 2.058120803032954 and parameters: {'n_estimators': 868, 'max_depth': 20, 'criterion': 'squared_error', 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 0 with value: 1.8692153963421752.
2.058120803032954
[I 2024-11-01 12:54:22,024] Trial 5 finished with value: 2.431245361050971 and parameters: {'n_estimators': 372, 'max_depth': 15, 'criterion': 'squared_error', 'min_samples_split': 3, 'min_samples_leaf': 9, 'max_features': 'log2'}. Best is trial 0 with value: 1.8692153963421752.
2.431245361050971
[I 2024-11-01 12:54:25,889] Trial 6 finished with value: 2.3868006769195462 and parameters: {'n_estimators': 263, 'max_depth': 18, 'criterion': 'squared_error', 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_features': 'sqrt'}. Best is trial 0 with value: 1.8692153963421752.
2.3868006769195462
[I 2024-11-01 12:54:28,220] Trial 7 finished with value: 1.6943086056550947 and parameters: {'n_estimators': 155, 'max_depth': 18, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
1.6943086056550947
[I 2024-11-01 12:54:36,629] Trial 8 finished with value: 2.7247277573482145 and parameters: {'n_estimators': 700, 'max_depth': 6, 'criterion': 'squared_error', 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
2.7247277573482145
[I 2024-11-01 12:54:40,806] Trial 9 finished with value: 2.5722439296323616 and parameters: {'n_estimators': 188, 'max_depth': 6, 'criterion': 'squared_error', 'min_samples_split': 4, 'min_samples_leaf': 8, 'max_features': None}. Best is trial 7 with value: 1.6943086056550947.
2.5722439296323616
[I 2024-11-01 12:54:41,851] Trial 10 finished with value: 2.1876399343790416 and parameters: {'n_estimators': 51, 'max_depth': 10, 'criterion': 'poisson', 'min_samples_split': 7, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 7 with value: 1.6943086056550947.
2.1876399343790416
[I 2024-11-01 12:55:00,874] Trial 11 finished with value: 1.8713569066632498 and parameters: {'n_estimators': 583, 'max_depth': 11, 'criterion': 'poisson', 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 7 with value: 1.6943086056550947.
1.8713569066632498
[I 2024-11-01 12:55:13,745] Trial 12 finished with value: 1.9469744251965995 and parameters: {'n_estimators': 412, 'max_depth': 14, 'criterion': 'poisson', 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': None}. Best is trial 7 with value: 1.6943086056550947.
1.9469744251965995
[I 2024-11-01 12:58:26,135] Trial 13 finished with value: 1.961259512125055 and parameters: {'n_estimators': 651, 'max_depth': 10, 'criterion': 'absolute_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 7 with value: 1.6943086056550947.
1.961259512125055
[I 2024-11-01 12:58:38,968] Trial 14 finished with value: 1.9444791830076358 and parameters: {'n_estimators': 985, 'max_depth': 17, 'criterion': 'friedman_mse', 'min_samples_split': 6, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 7 with value: 1.6943086056550947.
1.9444791830076358
[I 2024-11-01 12:58:44,243] Trial 15 finished with value: 2.371710278199946 and parameters: {'n_estimators': 347, 'max_depth': 8, 'criterion': 'poisson', 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
2.371710278199946
[I 2024-11-01 12:58:46,758] Trial 16 finished with value: 1.778663393902621 and parameters: {'n_estimators': 63, 'max_depth': 20, 'criterion': 'poisson', 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': None}. Best is trial 7 with value: 1.6943086056550947.
1.778663393902621
[I 2024-11-01 12:58:47,877] Trial 17 finished with value: 1.7171834583123782 and parameters: {'n_estimators': 79, 'max_depth': 20, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
1.7171834583123782
[I 2024-11-01 12:58:49,752] Trial 18 finished with value: 3.39760393022678 and parameters: {'n_estimators': 139, 'max_depth': 4, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
3.39760393022678
[I 2024-11-01 12:58:53,234] Trial 19 finished with value: 2.174140722799763 and parameters: {'n_estimators': 266, 'max_depth': 18, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
2.174140722799763
[I 2024-11-01 12:58:54,591] Trial 20 finished with value: 1.9935398614125317 and parameters: {'n_estimators': 86, 'max_depth': 15, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
1.9935398614125317
[I 2024-11-01 12:59:00,618] Trial 21 finished with value: 1.8969364092531942 and parameters: {'n_estimators': 67, 'max_depth': 20, 'criterion': 'absolute_error', 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 7 with value: 1.6943086056550947.
1.8969364092531942
[I 2024-11-01 12:59:04,933] Trial 22 finished with value: 1.721949445524661 and parameters: {'n_estimators': 135, 'max_depth': 20, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': None}. Best is trial 7 with value: 1.6943086056550947.
1.721949445524661
[I 2024-11-01 12:59:10,635] Trial 23 finished with value: 1.6912366928377367 and parameters: {'n_estimators': 307, 'max_depth': 18, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 23 with value: 1.6912366928377367.
1.6912366928377367
[I 2024-11-01 12:59:15,362] Trial 24 finished with value: 1.6908013636395873 and parameters: {'n_estimators': 313, 'max_depth': 18, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.6908013636395873
[I 2024-11-01 12:59:19,230] Trial 25 finished with value: 2.065122197550533 and parameters: {'n_estimators': 275, 'max_depth': 16, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
2.065122197550533
[I 2024-11-01 12:59:26,871] Trial 26 finished with value: 1.9483020065367702 and parameters: {'n_estimators': 513, 'max_depth': 18, 'criterion': 'friedman_mse', 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.9483020065367702
[I 2024-11-01 12:59:32,224] Trial 27 finished with value: 1.9154585486320632 and parameters: {'n_estimators': 365, 'max_depth': 16, 'criterion': 'friedman_mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.9154585486320632
[I 2024-11-01 12:59:36,295] Trial 28 finished with value: 1.7684227445904774 and parameters: {'n_estimators': 323, 'max_depth': 13, 'criterion': 'squared_error', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.7684227445904774
[I 2024-11-01 13:00:10,977] Trial 29 finished with value: 2.5056760068454014 and parameters: {'n_estimators': 465, 'max_depth': 18, 'criterion': 'absolute_error', 'min_samples_split': 5, 'min_samples_leaf': 10, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
2.5056760068454014
[I 2024-11-01 13:00:13,888] Trial 30 finished with value: 1.8207039205968563 and parameters: {'n_estimators': 218, 'max_depth': 19, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.8207039205968563
[I 2024-11-01 13:00:16,088] Trial 31 finished with value: 1.711893283267114 and parameters: {'n_estimators': 136, 'max_depth': 19, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.711893283267114
[I 2024-11-01 13:00:18,277] Trial 32 finished with value: 1.7124769833390823 and parameters: {'n_estimators': 154, 'max_depth': 17, 'criterion': 'friedman_mse', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.7124769833390823
[I 2024-11-01 13:00:22,197] Trial 33 finished with value: 1.8195560862114373 and parameters: {'n_estimators': 310, 'max_depth': 19, 'criterion': 'friedman_mse', 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.8195560862114373
[I 2024-11-01 13:00:24,793] Trial 34 finished with value: 1.9539583390050117 and parameters: {'n_estimators': 216, 'max_depth': 16, 'criterion': 'friedman_mse', 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.9539583390050117
[I 2024-11-01 13:00:30,001] Trial 35 finished with value: 1.6996608676583196 and parameters: {'n_estimators': 406, 'max_depth': 19, 'criterion': 'squared_error', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.6996608676583196
[I 2024-11-01 13:00:35,355] Trial 36 finished with value: 1.848243602957832 and parameters: {'n_estimators': 429, 'max_depth': 15, 'criterion': 'squared_error', 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.848243602957832
[I 2024-11-01 13:00:41,627] Trial 37 finished with value: 1.8526421583386452 and parameters: {'n_estimators': 523, 'max_depth': 13, 'criterion': 'squared_error', 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
1.8526421583386452
[I 2024-11-01 13:00:47,387] Trial 38 finished with value: 2.3144170687498615 and parameters: {'n_estimators': 400, 'max_depth': 17, 'criterion': 'squared_error', 'min_samples_split': 4, 'min_samples_leaf': 7, 'max_features': 'sqrt'}. Best is trial 24 with value: 1.6908013636395873.
2.3144170687498615
[I 2024-11-01 13:00:52,256] Trial 39 finished with value: 1.6756511972615542 and parameters: {'n_estimators': 300, 'max_depth': 19, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.6756511972615542
[I 2024-11-01 13:00:56,019] Trial 40 finished with value: 2.071687337284148 and parameters: {'n_estimators': 287, 'max_depth': 14, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2'}. Best is trial 39 with value: 1.6756511972615542.
2.071687337284148
[I 2024-11-01 13:01:00,499] Trial 41 finished with value: 1.6757607187373635 and parameters: {'n_estimators': 324, 'max_depth': 19, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.6757607187373635
[I 2024-11-01 13:01:03,761] Trial 42 finished with value: 1.689632901319494 and parameters: {'n_estimators': 210, 'max_depth': 18, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.689632901319494
[I 2024-11-01 13:01:07,396] Trial 43 finished with value: 1.8231974192724758 and parameters: {'n_estimators': 239, 'max_depth': 17, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.8231974192724758
[I 2024-11-01 13:01:11,408] Trial 44 finished with value: 1.7023947129128612 and parameters: {'n_estimators': 352, 'max_depth': 19, 'criterion': 'squared_error', 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.7023947129128612
[I 2024-11-01 13:01:19,906] Trial 45 finished with value: 1.9500334626410079 and parameters: {'n_estimators': 586, 'max_depth': 18, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.9500334626410079
[I 2024-11-01 13:01:22,496] Trial 46 finished with value: 1.9903905861358466 and parameters: {'n_estimators': 187, 'max_depth': 16, 'criterion': 'squared_error', 'min_samples_split': 9, 'min_samples_leaf': 2, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.9903905861358466
[I 2024-11-01 13:01:27,145] Trial 47 finished with value: 1.6817817547311238 and parameters: {'n_estimators': 322, 'max_depth': 18, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.6817817547311238
[I 2024-11-01 13:01:32,640] Trial 48 finished with value: 2.238034558135237 and parameters: {'n_estimators': 464, 'max_depth': 19, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
2.238034558135237
[I 2024-11-01 13:01:35,916] Trial 49 finished with value: 1.6882408776970421 and parameters: {'n_estimators': 239, 'max_depth': 17, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}. Best is trial 39 with value: 1.6756511972615542.
1.6882408776970421
trial = study.best_trial
print("MAPE: {}".format(trial.value))
print("Best hyperparameters: {}".format(trial.params))
print(f'\nPerformance difference with default parameters is : {trial.value - [t[1] for t in perf_history_int if t[0] == best_comb_int][0]}')
MAPE: 1.6756511972615542
Best hyperparameters: {'n_estimators': 300, 'max_depth': 19, 'criterion': 'squared_error', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt'}

Performance difference with default parameters is : -0.035146172920057284
With this, we should be close to the best model we can get with our approach.
D - Testing¶
Let's now test our best model on the test set!
from sklearn.metrics import mean_absolute_percentage_error, r2_score
best_model = RandomForestRegressor(**trial.params)
X_train, Y_train, X_test, Y_test = get_train_test(datasets['best_features_w_interactions'], train_set, test_set, TARGET)
best_model.fit(X_train, Y_train)
yhat = best_model.predict(X_test)
final_perf = mean_absolute_percentage_error(Y_test, yhat) * 100
r2 = r2_score(Y_test, yhat)
dico = {'True':Y_test, 'Predicted':yhat, 'Difference':np.abs(yhat - Y_test)}
df = pd.DataFrame(dico)
norm = plt.Normalize(df['Difference'].min(), df['Difference'].max())
sm = plt.cm.ScalarMappable(cmap="viridis", norm=norm)
sm.set_array([])
ax = sns.scatterplot(x='True', y='Predicted', data=df, palette='viridis', hue='Difference')
ax.get_legend().remove()
ax.figure.colorbar(sm, ax=ax, label='Absolute Error | yhat - Y_test |')
plt.title(f"{final_perf:.2f} % average error\nMax error : {df['Difference'].max():.2f}\nR2 : {r2:.2f}")
plt.show()
feature_importances = best_model.feature_importances_
features = best_model.feature_names_in_
dico_features = {'Features':features, 'Importances':feature_importances}
df_features = pd.DataFrame(dico_features)
ax = sns.barplot(data=df_features, y='Features', x='Importances', orient='h', hue='Importances')
ax.get_legend().remove()
plt.show()
Feature importances give a summarized view of which features matter most to the model when making predictions. The plot above shows that the model relies heavily on the native features HIV/AIDS, Income composition of resources, Adult Mortality and Schooling, and on the interactions HIV/AIDS_x_Schooling and Adult Mortality_x_Developing. This seems reasonable: it means the model is looking at deaths of young children due to HIV/AIDS, an index of the average wealth of a country's inhabitants, the probability of dying between 15 and 60 years of age, and the average number of years of schooling (see the dataset description on Kaggle).
A more critical reading would note that the importance of the two interaction features is largely due to the fact that they contain very important native features (HIV/AIDS and Adult Mortality). We should keep in mind that each split in our RandomForest only considers a random subset of the available features (max_features='sqrt' in the tuned model). So when a tree's splits do not get access to the highly important native features but do get access to these interaction features, using the interactions is almost as valuable, because the main correlations are preserved within them.
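One way to partially control for this bias, not part of the original analysis but shown here as a sketch, is permutation importance: it measures how much the held-out score degrades when a single feature's values are shuffled.
from sklearn.inspection import permutation_importance

# Model-agnostic importance: shuffle each feature on the test set and measure
# the drop in score, averaged over several repeats.
perm = permutation_importance(
    best_model, X_test, Y_test,
    scoring='neg_mean_absolute_percentage_error',
    n_repeats=10, random_state=42, n_jobs=-1,
)
perm_df = pd.DataFrame({'Features': X_test.columns, 'Importances': perm.importances_mean})
perm_df = perm_df.sort_values('Importances', ascending=False)
ax = sns.barplot(data=perm_df, y='Features', x='Importances', orient='h')
plt.show()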
Another way to understand the model is to use Shapley values (here is a valuable blog entry on the topic).
Shapley values estimate how much each feature's value impacts the model output. They are really valuable when trying to make sense of a model's predictions.
import shap
explainer = shap.Explainer(best_model)
shap_values = explainer(X_test)
shap.plots.bar(shap_values, max_display=len(X_test.columns))
This plot shows the average absolute SHAP value of each feature in our model. Our findings from sklearn's feature_importances_ are largely recapitulated here. There are slight differences, though: with this method the most impactful feature for Life expectancy prediction is Income composition of resources, followed by HIV/AIDS, whereas it was the other way around with the previous method.
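To put the two rankings side by side, here is a short sketch reusing the objects already computed above.
# Compare impurity-based importances with mean absolute SHAP values (normalized
# so that both columns sum to 1 and are directly comparable).
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
comparison = pd.DataFrame({
    'Feature': X_test.columns,
    'RF importance': best_model.feature_importances_,
    'Mean |SHAP| (normalized)': mean_abs_shap / mean_abs_shap.sum(),
}).sort_values('Mean |SHAP| (normalized)', ascending=False)
print(comparison)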
The power of Shapley values is that we can examine even more precisely the output the model gives for a given observation, which lets us investigate individual predictions and thus either run sanity checks or, on the contrary, look for explanations of errors.
big_Life_expectancy = (df['True'] > 74)
low1 = df.loc[big_Life_expectancy]['Difference'].nsmallest(1).iloc[0]
idx1 = list(df['Difference']).index(low1)
small_Life_expectancy = (df['True'] < 59)
low2 = df.loc[small_Life_expectancy]['Difference'].nsmallest(1).iloc[0]
idx2 = list(df['Difference']).index(low2)
# Here are two observations of the test set accurately predicted by the model.
# df keeps the original row labels of the raw data, so we map the positional
# test-set index back to the raw dataframe through df.index to recover the country.
plt.title(f"Country : {datasets['raw']['Country'].loc[df.index[idx1]]}, True Life expectancy : {df['True'].iloc[idx1]}")
shap.plots.waterfall(shap_values[idx1], max_display=len(X_test.columns)) # High Life expectancy true value
plt.title(f"Country : {datasets['raw']['Country'].loc[df.index[idx2]]}, True Life expectancy : {df['True'].iloc[idx2]}")
shap.plots.waterfall(shap_values[idx2], max_display=len(X_test.columns)) # Low Life expectancy true value
The plots above show how the model arrived at its prediction for each observation. Each line can be read as: "because feature F had value X in this observation, the model shifted its prediction in this direction", where the size of the shift corresponds to that feature's Shapley value.
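As a sanity check on how to read these plots, the per-feature SHAP values plus the base value should reconstruct the model's prediction for that observation. A quick sketch using the objects computed above:
# Additivity check: base value + sum of SHAP values == model prediction
# (up to floating point error) for the first accurately predicted observation.
pred = best_model.predict(X_test.iloc[[idx1]])[0]
reconstructed = shap_values[idx1].base_values + shap_values[idx1].values.sum()
print(f'Model prediction              : {pred:.3f}')
print(f'Base value + sum(SHAP values) : {reconstructed:.3f}')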
Let's look at some observations the model got wrong.
largests = list(df['Difference'].nlargest(2))
biggest_error_idx = list(df['Difference']).index(largests[0])
second_error_idx = list(df['Difference']).index(largests[1])
# Here are two observations of the test set badly predicted by the model.
plt.title(f"Country : {datasets['raw']['Country'].loc[df.index[biggest_error_idx]]}, True Life expectancy : {df['True'].iloc[biggest_error_idx]}")
shap.plots.waterfall(shap_values[biggest_error_idx], max_display=len(X_test.columns)) # Largest prediction error
plt.title(f"Country : {datasets['raw']['Country'].loc[df.index[second_error_idx]]}, True Life expectancy : {df['True'].iloc[second_error_idx]}")
shap.plots.waterfall(shap_values[second_error_idx], max_display=len(X_test.columns)) # Second largest prediction error
Looking at the observations above, I think we can all agree that the model's output "makes sense": these were countries with high child mortality due to HIV/AIDS and high Adult Mortality. Analyzing them suggests that features the model lacks, such as birth rate or population-structure indicators (proportion of citizens under the poverty threshold, proportion of very high income households), could help better estimate the true Life expectancy. In any case, the model did not learn patterns that generalize to these observations.
One last thing I want to show is how Shapley values can be used to see how each feature's values tend to impact the model output across all samples, instead of looking at individual observations.
shap.plots.beeswarm(shap_values, max_display=len(X_test.columns))
In this plot, the more widely a feature's points are scattered, the more important that feature is for the predictions (high absolute Shapley values). For each feature, the color of the points indicates how high or low the feature's value was for that observation. When the colors form a clear gradient, we can directly read how the feature's value tends to impact the model output. For example, very high values of HIV/AIDS have a huge negative impact on the model output, whereas low to very low values barely impact it. This means the model learned to treat any increase in HIV/AIDS as a major indicator that Life expectancy will be lower.
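To look at a single feature in isolation, a SHAP dependence scatter could also be drawn. This is only a sketch; note the leading space in the column name, inherited from the raw CSV.
# SHAP dependence plot for the HIV/AIDS feature; passing color=shap_values lets
# shap pick the feature with the strongest interaction for the coloring.
shap.plots.scatter(shap_values[:, ' HIV/AIDS'], color=shap_values)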
I will conclude the project on this note, as we have applied a careful machine learning approach to this task and even delved into model explainability and error analysis.