In [1]:
import requests
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn import linear_model as sklm 
import xgboost as xgb
import scipy as sp

%matplotlib inline

FHIR_API = "http://gk-fhir-server-gatekeeper-pilot1.apps.okd.seclab.local/gk-fhir-server/fhir"

Data preparation

Reading data from GATEKEEPER Data Federation

In [2]:
%%time

subjects = requests.get(f"{FHIR_API}/ResearchSubject", {'_tag': 'robusto-et-al-2018-dataset', '_count': 200}).json()
tabella = []
i = 0
while subjects:
    for s in subjects['entry']:
        riga = {}
        tabella.append(riga)
        riga['CASECTRL'] = '1' if s['resource']['actualArm'] == 'case' else '2'
        patient = requests.get(f"{FHIR_API}/{s['resource']['individual']['reference']}").json()
        riga['SESSO'] = '1' if patient['gender'] == 'male' else '2'
        riga['COMURESI'] = patient['address'][0]['postalCode']
        if 'generalPractitioner' in patient:
            riga['MEDICO'] = patient['generalPractitioner'][0]['identifier']['value']
        else:
            riga['MEDICO'] = np.NaN
        observations = requests.get(f"{FHIR_API}/Observation", {'subject': patient['id']}).json()['entry']
        for o in observations:
            if o['resource']['code']['coding'][0]['code'] == '30525-0':
                riga['AGE'] = o['resource']['valueQuantity']['value']
            elif o['resource']['code']['coding'][0]['code'] == 'risk-score-ddci':
                riga['F_DDCI_PRE'] = o['resource']['valueInteger']
            elif o['resource']['code']['coding'][0]['code'] == 'risk-score-cci':
                riga['H_CHARL_PRE'] = o['resource']['valueInteger']
            elif o['resource']['code']['coding'][0]['code'] == 'healthcare-costs-drugs':
                if o['resource']['effectivePeriod']['start'] == '2011-01-01':
                    riga['F_SPESA_PRE'] = o['resource']['valueQuantity']['value']
                    riga['INDXDATA'] = o['resource']['effectivePeriod']['end']
                else:
                    riga['F_SPESA_POST'] = o['resource']['valueQuantity']['value']
                    riga['LASTDATA'] = o['resource']['effectivePeriod']['end']
            elif o['resource']['code']['coding'][0]['code'] == 'healthcare-costs-hospitalizations':
                if o['resource']['effectivePeriod']['start'] == '2011-01-01':
                    riga['H_SPTOT_PRE'] = o['resource']['valueQuantity']['value']
                else:
                    riga['H_SPTOT_POST'] = o['resource']['valueQuantity']['value']
            elif o['resource']['code']['coding'][0]['code'] == 'healthcare-costs-outpatient-visits':
                if o['resource']['effectivePeriod']['start'] == '2011-01-01':
                    riga['S_SPESA_PRE'] = o['resource']['valueQuantity']['value']
                else:
                    riga['S_SPESA_POST'] = o['resource']['valueQuantity']['value']
    print(f"Page #{i} loaded from FHIR server")  
    i += 1
    new_subjects = None
    for l in subjects['link']:
        if l['relation'] == 'next':
            new_subjects = requests.get(l['url']).json()
    subjects = new_subjects
print("Done")

frame = pd.DataFrame(tabella).astype({
    'CASECTRL': 'category',
    'SESSO': 'category',
    'MEDICO': 'category',
    'COMURESI': 'category',
    'INDXDATA': 'datetime64[ns]',
    'LASTDATA': 'datetime64[ns]'
})
Page #0 loaded from FHIR server
Page #1 loaded from FHIR server
Page #2 loaded from FHIR server
Page #3 loaded from FHIR server
Page #4 loaded from FHIR server
Page #5 loaded from FHIR server
Page #6 loaded from FHIR server
Page #7 loaded from FHIR server
Page #8 loaded from FHIR server
Page #9 loaded from FHIR server
Page #10 loaded from FHIR server
Page #11 loaded from FHIR server
Page #12 loaded from FHIR server
Page #13 loaded from FHIR server
Page #14 loaded from FHIR server
Page #15 loaded from FHIR server
Page #16 loaded from FHIR server
Done
CPU times: user 11.1 s, sys: 5.28 s, total: 16.4 s
Wall time: 4min 18s
In [3]:
frame
Out[3]:
CASECTRL SESSO COMURESI MEDICO AGE F_DDCI_PRE H_CHARL_PRE F_SPESA_PRE INDXDATA F_SPESA_POST LASTDATA H_SPTOT_PRE H_SPTOT_POST S_SPESA_PRE S_SPESA_POST
0 1 1 072042 120219 47.96 0 0 289.84 2012-05-09 416.56 2014-12-31 0.00 0.00 87.32 229.66
1 2 1 072042 120243 47.90 0 0 65.48 2012-01-01 178.90 2014-12-31 0.00 0.00 0.00 709.66
2 2 1 072042 120243 46.70 0 0 41.15 2012-01-01 142.09 2014-12-31 0.00 0.00 175.81 32.02
3 1 1 072042 530903 46.82 1 0 79.64 2012-04-23 236.89 2014-12-31 0.00 0.00 74.62 117.24
4 1 1 072042 120219 49.78 1 0 1246.37 2012-04-06 2384.33 2014-12-31 0.00 2632.77 677.36 936.25
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3205 2 2 073015 512543 77.82 7 0 451.76 2012-07-01 1017.14 2014-12-31 0.00 8707.30 88.16 1571.60
3206 2 2 073021 517459 75.06 7 1 2347.01 2012-07-01 2550.36 2014-12-31 3639.51 855.00 319.19 792.45
3207 1 2 073021 928258 62.12 1 0 669.54 2013-05-21 1152.84 2014-12-31 0.00 0.00 821.12 244.72
3208 2 2 073019 517437 60.99 1 0 689.63 2013-01-01 333.60 2014-12-31 0.00 0.00 735.68 302.64
3209 2 2 073021 517038 60.80 1 0 103.66 2013-01-01 0.00 2014-12-31 0.00 0.00 0.00 0.00

3210 rows × 15 columns

Cleaning and transforming data

In [4]:
frame.dropna(inplace=True) # drops 9 with missing value for MEDICO
frame.shape
Out[4]:
(3201, 15)
In [5]:
frame['PRE_DAYS'] = frame.INDXDATA - np.datetime64('2011-01-01')
frame['POST_DAYS'] = frame.LASTDATA - frame.INDXDATA
In [6]:
# only consider pre period for CCM cases
frame.loc[frame.CASECTRL == '1', ['F_SPESA_POST', 'H_SPTOT_POST', 'S_SPESA_POST']] = 0
frame.loc[frame.CASECTRL == '1', 'POST_DAYS'] = pd.Timedelta(days=0)
In [7]:
frame['SPESA_TOT'] = (
        frame.F_SPESA_PRE + frame.H_SPTOT_PRE + frame.S_SPESA_PRE +
        frame.F_SPESA_POST + frame.H_SPTOT_POST + frame.S_SPESA_POST
    ) / (frame.PRE_DAYS + frame.POST_DAYS).apply(lambda x: x.days)*365
frame.drop(columns = [
    'CASECTRL',
    'INDXDATA',
    'F_SPESA_PRE',
    'H_SPTOT_PRE',
    'S_SPESA_PRE',
    'F_SPESA_POST',
    'H_SPTOT_POST',
    'S_SPESA_POST',
    'LASTDATA',
    'PRE_DAYS',
    'POST_DAYS'
], inplace=True)
In [8]:
frame
Out[8]:
SESSO COMURESI MEDICO AGE F_DDCI_PRE H_CHARL_PRE SPESA_TOT
0 1 072042 120219 47.96 0 0 278.670850
1 1 072042 120243 47.90 0 0 238.510000
2 1 072042 120243 46.70 0 0 97.767500
3 1 072042 530903 46.82 1 0 117.792678
4 1 072042 120219 49.78 1 0 1523.126790
... ... ... ... ... ... ... ...
3205 2 073015 512543 77.82 7 0 2958.990000
3206 2 073021 517459 75.06 7 1 2625.880000
3207 2 073021 928258 62.12 1 0 624.673823
3208 2 073019 517437 60.99 1 0 515.387500
3209 2 073021 517038 60.80 1 0 25.915000

3201 rows × 7 columns

In [9]:
frame.dtypes
Out[9]:
SESSO          category
COMURESI       category
MEDICO         category
AGE             float64
F_DDCI_PRE        int64
H_CHARL_PRE       int64
SPESA_TOT       float64
dtype: object
In [10]:
frame.hist()
Out[10]:
array([[<AxesSubplot:title={'center':'AGE'}>,
        <AxesSubplot:title={'center':'F_DDCI_PRE'}>],
       [<AxesSubplot:title={'center':'H_CHARL_PRE'}>,
        <AxesSubplot:title={'center':'SPESA_TOT'}>]], dtype=object)

Preparing tensors

In [11]:
frame = pd.get_dummies(frame) # one hot encodings for categorical (2 SESSO, 463 MEDICO, 64 COMURESI, 6 ASL, 13 DISTRETTO)
frame = frame.reindex(columns=np.roll(frame.columns.values, -4)) # replace SPESA_TOT at the end (get_dummies moved variables)
frame = frame.astype('float32')
In [12]:
print(frame.dtypes)
SESSO_1            float32
SESSO_2            float32
COMURESI_071046    float32
COMURESI_071047    float32
COMURESI_071049    float32
                    ...   
MEDICO_929672      float32
AGE                float32
F_DDCI_PRE         float32
H_CHARL_PRE        float32
SPESA_TOT          float32
Length: 539, dtype: object
In [13]:
frame.to_csv("../data/tabella.csv", index=False)

Building and training the DNN model

In [14]:
def build_model(input_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', kernel_initializer='normal', input_shape=(input_size,)),
        tf.keras.layers.Dense(64, activation='relu', kernel_initializer='normal'),
        tf.keras.layers.Dense(1, kernel_initializer='normal')
    ])
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model
In [15]:
def fit_DNN(data, targets, epochs, batch_size):
    print("\tTraining DNN model...")
    
    folds_losses = []
    folds_val_losses = []
    for i, (train, test) in enumerate(sk.model_selection.KFold(n_splits=10).split(data)):
        print(f"\tProcessing cross-validation fold #{i}")
        model = build_model(data.shape[1])
        history = model.fit(data[train], targets[train], validation_data=(data[test], targets[test]), epochs=epochs, batch_size=batch_size, verbose=0)

        # reporting
        history_dict = history.history
        folds_losses.append(history_dict['loss'])
        folds_val_losses.append(history_dict['val_loss'])

    # compute training results
    mean_loss_per_epoch = [np.mean([x[i] for x in folds_losses]) for i in range(epochs)]
    mean_val_loss_per_epoch = [np.mean([x[i] for x in folds_val_losses]) for i in range(epochs)]
    epoch_min = np.array(mean_val_loss_per_epoch).argmin()

    # plot training results
    epochs_range = range(11, epochs + 1)
    plt.plot(epochs_range, mean_loss_per_epoch[10:], 'r', label="Training loss")
    plt.plot(epochs_range, mean_val_loss_per_epoch[10:], 'b', label="Validation loss")
    plt.title("Training and validation loss (mse)")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    # output validation results
    print(f"\tBest epoch: {epoch_min}, (loss, validation loss) at best epoch: {mean_loss_per_epoch[epoch_min], mean_val_loss_per_epoch[epoch_min]}")
        
    # build and return final model
    model = build_model(data.shape[1])
    model.fit(data, targets, epochs=epoch_min, batch_size=batch_size, verbose=0)
    return model

Execute comparison

In [16]:
%%time

dataset = frame.values
N_SPLITS = 20
TRAIN_SIZE = 0.66

mse = []
mse_lr = []
mse_ri = []
mse_xgb = []
for i, (train, test) in enumerate(sk.model_selection.ShuffleSplit(n_splits=N_SPLITS, train_size=TRAIN_SIZE, random_state=0).split(dataset)):
    print(f"Processing split #{i}...")
    
    # separate train and test data and targets
    train_data = dataset[train,0:-1]
    train_targets = dataset[train,-1]
    test_data = dataset[test,0:-1]
    test_targets = dataset[test,-1]
    
    # normalize data
    mean = train_data[:,range(-3, 0)].mean(axis=0)
    train_data[:,range(-3, 0)] -= mean
    test_data[:,range(-3, 0)] -= mean
    std = train_data[:,range(-3, 0)].std(axis=0)
    train_data[:,range(-3, 0)] /= std
    test_data[:,range(-3, 0)] /= std
    
    model = fit_DNN(train_data, train_targets, epochs=50, batch_size=16)
    mse.append(sk.metrics.mean_squared_error(model.predict(test_data).squeeze(), test_targets))
    print(f"MSE DNN (sample {i+1}): {mse[-1]}")

    model_lr = sklm.LinearRegression().fit(train_data, train_targets)
    mse_lr.append(sk.metrics.mean_squared_error(model_lr.predict(test_data), test_targets))
    print(f"MSE Linear Regression (sample {i+1}): {mse_lr[-1]}")

    model_ri = sklm.Ridge().fit(train_data, train_targets)
    mse_ri.append(sk.metrics.mean_squared_error(model_ri.predict(test_data), test_targets))
    print(f"MSE Ridge Regression (sample {i+1}): {mse_ri[-1]}")

    model_xgb = xgb.XGBRegressor().fit(train_data, train_targets)
    mse_xgb.append(sk.metrics.mean_squared_error(model_xgb.predict(test_data), test_targets))
    print(f"MSE XGBoost (sample {i+1}): {mse_xgb[-1]}")
Processing split #0...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 19, (loss, validation loss) at best epoch: (6711199.75, 6973947.375)
MSE DNN (sample 1): 10598767.0
MSE Linear Regression (sample 1): 78104265490432.0
MSE Ridge Regression (sample 1): 11090099.0
MSE XGBoost (sample 1): 10832469.0
Processing split #1...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 19, (loss, validation loss) at best epoch: (7335929.9, 7592091.3)
MSE DNN (sample 2): 9146529.0
MSE Linear Regression (sample 2): 166354432.0
MSE Ridge Regression (sample 2): 9792174.0
MSE XGBoost (sample 2): 10219625.0
Processing split #2...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 18, (loss, validation loss) at best epoch: (8240191.8, 8471092.725)
MSE DNN (sample 3): 7623442.0
MSE Linear Regression (sample 3): 2784233472.0
MSE Ridge Regression (sample 3): 8200034.0
MSE XGBoost (sample 3): 8903619.0
Processing split #3...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 19, (loss, validation loss) at best epoch: (8084366.85, 8350298.525)
MSE DNN (sample 4): 7798340.0
MSE Linear Regression (sample 4): 663094165504.0
MSE Ridge Regression (sample 4): 8837839.0
MSE XGBoost (sample 4): 10573329.0
Processing split #4...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 16, (loss, validation loss) at best epoch: (6560185.6, 6772657.3)
MSE DNN (sample 5): 10854652.0
MSE Linear Regression (sample 5): 969861038080.0
MSE Ridge Regression (sample 5): 11370690.0
MSE XGBoost (sample 5): 12366886.0
Processing split #5...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 22, (loss, validation loss) at best epoch: (8079659.85, 8417881.8)
MSE DNN (sample 6): 7635150.0
MSE Linear Regression (sample 6): 89894699008.0
MSE Ridge Regression (sample 6): 9352565.0
MSE XGBoost (sample 6): 10114544.0
Processing split #6...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 19, (loss, validation loss) at best epoch: (9068574.7, 9355743.15)
MSE DNN (sample 7): 5776131.0
MSE Linear Regression (sample 7): 2272728711168.0
MSE Ridge Regression (sample 7): 6492246.5
MSE XGBoost (sample 7): 6886622.5
Processing split #7...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 18, (loss, validation loss) at best epoch: (7983305.35, 8250985.85)
MSE DNN (sample 8): 7986799.5
MSE Linear Regression (sample 8): 1481917595648.0
MSE Ridge Regression (sample 8): 8737938.0
MSE XGBoost (sample 8): 9127824.0
Processing split #8...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 18, (loss, validation loss) at best epoch: (6813375.15, 7055496.9)
MSE DNN (sample 9): 10342450.0
MSE Linear Regression (sample 9): 10049022853120.0
MSE Ridge Regression (sample 9): 11008429.0
MSE XGBoost (sample 9): 10906753.0
Processing split #9...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 20, (loss, validation loss) at best epoch: (8351665.2, 8669208.125)
MSE DNN (sample 10): 7116965.5
MSE Linear Regression (sample 10): 141034160128.0
MSE Ridge Regression (sample 10): 8810520.0
MSE XGBoost (sample 10): 8729725.0
Processing split #10...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 18, (loss, validation loss) at best epoch: (8244271.65, 8519899.725)
MSE DNN (sample 11): 7621656.5
MSE Linear Regression (sample 11): 16729680904192.0
MSE Ridge Regression (sample 11): 8348842.0
MSE XGBoost (sample 11): 8758860.0
Processing split #11...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 20, (loss, validation loss) at best epoch: (7698964.0, 7995148.85)
MSE DNN (sample 12): 8509079.0
MSE Linear Regression (sample 12): 165274435584.0
MSE Ridge Regression (sample 12): 9442993.0
MSE XGBoost (sample 12): 11004817.0
Processing split #12...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 19, (loss, validation loss) at best epoch: (7545661.45, 7824223.45)
MSE DNN (sample 13): 8770309.0
MSE Linear Regression (sample 13): 13571843948544.0
MSE Ridge Regression (sample 13): 9430155.0
MSE XGBoost (sample 13): 9957452.0
Processing split #13...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 20, (loss, validation loss) at best epoch: (9474138.2, 9820534.2)
MSE DNN (sample 14): 4876786.0
MSE Linear Regression (sample 14): 104572256.0
MSE Ridge Regression (sample 14): 6731767.5
MSE XGBoost (sample 14): 6911019.5
Processing split #14...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 20, (loss, validation loss) at best epoch: (7418593.35, 7701030.225)
MSE DNN (sample 15): 8981461.0
MSE Linear Regression (sample 15): 3775750471680.0
MSE Ridge Regression (sample 15): 9997636.0
MSE XGBoost (sample 15): 10549466.0
Processing split #15...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 21, (loss, validation loss) at best epoch: (9109360.5, 9491556.1)
MSE DNN (sample 16): 5596969.5
MSE Linear Regression (sample 16): 3366612107264.0
MSE Ridge Regression (sample 16): 6993876.0
MSE XGBoost (sample 16): 7485041.0
Processing split #16...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 19, (loss, validation loss) at best epoch: (6844388.45, 7152650.975)
MSE DNN (sample 17): 10131458.0
MSE Linear Regression (sample 17): 12004952637440.0
MSE Ridge Regression (sample 17): 10577602.0
MSE XGBoost (sample 17): 11264083.0
Processing split #17...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 18, (loss, validation loss) at best epoch: (6743840.25, 6966404.15)
MSE DNN (sample 18): 10422593.0
MSE Linear Regression (sample 18): 65336611176448.0
MSE Ridge Regression (sample 18): 11012453.0
MSE XGBoost (sample 18): 12020359.0
Processing split #18...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 22, (loss, validation loss) at best epoch: (8321538.2, 8632629.65)
MSE DNN (sample 19): 7149908.5
MSE Linear Regression (sample 19): 2151882162176.0
MSE Ridge Regression (sample 19): 8071489.5
MSE XGBoost (sample 19): 11426367.0
Processing split #19...
	Training DNN model...
	Processing cross-validation fold #0
	Processing cross-validation fold #1
	Processing cross-validation fold #2
	Processing cross-validation fold #3
	Processing cross-validation fold #4
	Processing cross-validation fold #5
	Processing cross-validation fold #6
	Processing cross-validation fold #7
	Processing cross-validation fold #8
	Processing cross-validation fold #9
	Best epoch: 22, (loss, validation loss) at best epoch: (8412823.1, 8777721.075)
MSE DNN (sample 20): 6883988.5
MSE Linear Regression (sample 20): 3699048185856.0
MSE Ridge Regression (sample 20): 8155014.0
MSE XGBoost (sample 20): 9120137.0
CPU times: user 41min 56s, sys: 4min 27s, total: 46min 24s
Wall time: 29min 45s

Output comparison results

In [17]:
def output_results(difference):
    print(f"MSE difference - Mean: {np.mean(difference)}, SEM: {sp.stats.sem(difference)}") # SEM = standard error of the mean = corrected sample std / sqrt(n)
    print(f"CI 95%: {sp.stats.t.interval(alpha=0.95,df=N_SPLITS-1,loc=np.mean(difference),scale=sp.stats.sem(difference))}")
    print(f"p-value: {sp.stats.ttest_1samp(difference, popmean=0).pvalue}")
    # corrected SEM as per Nadeau and Bengio (2003). https://doi.org/10.1023/A:1024068626366
    corrected_sem = sp.stats.sem(difference)*np.sqrt(1+N_SPLITS*(1-TRAIN_SIZE)/TRAIN_SIZE)
    print(f"corrected CI 95%: {sp.stats.t.interval(alpha=0.95,df=N_SPLITS-1,loc=np.mean(difference),scale=corrected_sem)}")
    print(f"corrected p-value: {(1-sp.stats.t.cdf(np.mean(difference)/corrected_sem, df=N_SPLITS-1))*2}")
In [18]:
print("Comparison with Linear Regression")
output_results(np.array(mse_lr) - np.array(mse))
print("\nComparison with Ridge Regression")
output_results(np.array(mse_ri) - np.array(mse))
print("\nComparison with XGBoost")
output_results(np.array(mse_xgb) - np.array(mse))
Comparison with Linear Regression
MSE difference - Mean: 10728817819648.0, SEM: 4820153581035.3125
CI 95%: (640120428598.9629, 20817515210697.04)
p-value: 0.038328013541210196
corrected CI 95%: (-23189362258103.434, 44646997897399.44)
corrected p-value: 0.5158869120295191

Comparison with Ridge Regression
MSE difference - Mean: 931546.375, SEM: 97242.31986145888
CI 95%: (728015.8604235041, 1135076.8895764959)
p-value: 1.0463795704836338e-08
corrected CI 95%: (247277.19959110476, 1615815.5504088951)
corrected p-value: 0.010255968179692099

Comparison with XGBoost
MSE difference - Mean: 1666778.0, SEM: 199264.0812951501
CI 95%: (1249713.4846696872, 2083842.5153303128)
p-value: 8.590339499361529e-08
corrected CI 95%: (264607.94903143356, 3068948.0509685664)
corrected p-value: 0.0222967064591022
In [ ]: