import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy as sp
import scipy.stats  # makes sp.stats resolvable (plain `import scipy` does not)
import sklearn as sk
import tensorflow as tf
import xgboost as xgb
from sklearn import linear_model as sklm
from sklearn import metrics, model_selection  # makes sk.metrics / sk.model_selection resolvable
%matplotlib inline
FHIR_API = "http://gk-fhir-server-gatekeeper-pilot1.apps.okd.seclab.local/gk-fhir-server/fhir"
%%time
subjects = requests.get(f"{FHIR_API}/ResearchSubject", {'_tag': 'robusto-et-al-2018-dataset', '_count': 200}).json()
tabella = []
i = 0
while subjects:
for s in subjects['entry']:
riga = {}
tabella.append(riga)
riga['CASECTRL'] = '1' if s['resource']['actualArm'] == 'case' else '2'
patient = requests.get(f"{FHIR_API}/{s['resource']['individual']['reference']}").json()
riga['SESSO'] = '1' if patient['gender'] == 'male' else '2'
riga['COMURESI'] = patient['address'][0]['postalCode']
if 'generalPractitioner' in patient:
riga['MEDICO'] = patient['generalPractitioner'][0]['identifier']['value']
else:
riga['MEDICO'] = np.NaN
observations = requests.get(f"{FHIR_API}/Observation", {'subject': patient['id']}).json()['entry']
for o in observations:
if o['resource']['code']['coding'][0]['code'] == '30525-0':
riga['AGE'] = o['resource']['valueQuantity']['value']
elif o['resource']['code']['coding'][0]['code'] == 'risk-score-ddci':
riga['F_DDCI_PRE'] = o['resource']['valueInteger']
elif o['resource']['code']['coding'][0]['code'] == 'risk-score-cci':
riga['H_CHARL_PRE'] = o['resource']['valueInteger']
elif o['resource']['code']['coding'][0]['code'] == 'healthcare-costs-drugs':
if o['resource']['effectivePeriod']['start'] == '2011-01-01':
riga['F_SPESA_PRE'] = o['resource']['valueQuantity']['value']
riga['INDXDATA'] = o['resource']['effectivePeriod']['end']
else:
riga['F_SPESA_POST'] = o['resource']['valueQuantity']['value']
riga['LASTDATA'] = o['resource']['effectivePeriod']['end']
elif o['resource']['code']['coding'][0]['code'] == 'healthcare-costs-hospitalizations':
if o['resource']['effectivePeriod']['start'] == '2011-01-01':
riga['H_SPTOT_PRE'] = o['resource']['valueQuantity']['value']
else:
riga['H_SPTOT_POST'] = o['resource']['valueQuantity']['value']
elif o['resource']['code']['coding'][0]['code'] == 'healthcare-costs-outpatient-visits':
if o['resource']['effectivePeriod']['start'] == '2011-01-01':
riga['S_SPESA_PRE'] = o['resource']['valueQuantity']['value']
else:
riga['S_SPESA_POST'] = o['resource']['valueQuantity']['value']
print(f"Page #{i} loaded from FHIR server")
i += 1
new_subjects = None
for l in subjects['link']:
if l['relation'] == 'next':
new_subjects = requests.get(l['url']).json()
subjects = new_subjects
print("Done")
# --- Build and clean the analysis DataFrame from the fetched rows ---
# Identifier-like columns become pandas categories; the two period
# boundaries (INDXDATA, LASTDATA) are parsed as datetimes.
frame = pd.DataFrame(tabella).astype({
'CASECTRL': 'category',
'SESSO': 'category',
'MEDICO': 'category',
'COMURESI': 'category',
'INDXDATA': 'datetime64[ns]',
'LASTDATA': 'datetime64[ns]'
})
frame
frame.dropna(inplace=True) # drops 9 with missing value for MEDICO
frame.shape
# Length (as Timedelta) of the pre period (2011-01-01 .. INDXDATA)
# and of the post period (INDXDATA .. LASTDATA).
frame['PRE_DAYS'] = frame.INDXDATA - np.datetime64('2011-01-01')
frame['POST_DAYS'] = frame.LASTDATA - frame.INDXDATA
# only consider pre period for CCM cases
frame.loc[frame.CASECTRL == '1', ['F_SPESA_POST', 'H_SPTOT_POST', 'S_SPESA_POST']] = 0
frame.loc[frame.CASECTRL == '1', 'POST_DAYS'] = pd.Timedelta(days=0)
# Regression target: total spend (drugs + hospitalizations + outpatient
# visits, pre + post) divided by the observed days, annualized (* 365).
frame['SPESA_TOT'] = (
frame.F_SPESA_PRE + frame.H_SPTOT_PRE + frame.S_SPESA_PRE +
frame.F_SPESA_POST + frame.H_SPTOT_POST + frame.S_SPESA_POST
) / (frame.PRE_DAYS + frame.POST_DAYS).apply(lambda x: x.days)*365
# Drop the intermediates now folded into SPESA_TOT, plus columns
# not used as predictors; only model inputs and the target remain.
frame.drop(columns = [
'CASECTRL',
'INDXDATA',
'F_SPESA_PRE',
'H_SPTOT_PRE',
'S_SPESA_PRE',
'F_SPESA_POST',
'H_SPTOT_POST',
'S_SPESA_POST',
'LASTDATA',
'PRE_DAYS',
'POST_DAYS'
], inplace=True)
frame
frame.dtypes
frame.hist()
frame = pd.get_dummies(frame) # one hot encodings for categorical (2 SESSO, 463 MEDICO, 64 COMURESI, 6 ASL, 13 DISTRETTO)
frame = frame.reindex(columns=np.roll(frame.columns.values, -4)) # replace SPESA_TOT at the end (get_dummies moved variables)
frame = frame.astype('float32')
print(frame.dtypes)
# Persist the prepared table for reuse outside this notebook.
frame.to_csv("../data/tabella.csv", index=False)
def build_model(input_size):
    """Return a compiled 64-64-1 ReLU MLP for regression (rmsprop, MSE loss, MAE metric)."""
    net = tf.keras.Sequential()
    net.add(tf.keras.layers.Dense(64, activation='relu',
                                  kernel_initializer='normal',
                                  input_shape=(input_size,)))
    net.add(tf.keras.layers.Dense(64, activation='relu', kernel_initializer='normal'))
    net.add(tf.keras.layers.Dense(1, kernel_initializer='normal'))
    net.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return net
def fit_DNN(data, targets, epochs, batch_size):
    """Select the best epoch count by 10-fold CV, then train a final model on all data.

    Trains a fresh model per fold for `epochs` epochs, averages the per-epoch
    train/validation losses across folds, plots the curves, and retrains on the
    full dataset up to the epoch with the lowest mean validation loss.
    Returns the final fitted Keras model.
    """
    print("\tTraining DNN model...")
    folds_losses = []
    folds_val_losses = []
    for fold, (train, test) in enumerate(sk.model_selection.KFold(n_splits=10).split(data)):
        print(f"\tProcessing cross-validation fold #{fold}")
        model = build_model(data.shape[1])
        history = model.fit(data[train], targets[train],
                            validation_data=(data[test], targets[test]),
                            epochs=epochs, batch_size=batch_size, verbose=0)
        # reporting
        history_dict = history.history
        folds_losses.append(history_dict['loss'])
        folds_val_losses.append(history_dict['val_loss'])
    # compute training results: mean loss across folds for each epoch
    # (loop index renamed to `e` so it no longer shadows the fold counter)
    mean_loss_per_epoch = [np.mean([f[e] for f in folds_losses]) for e in range(epochs)]
    mean_val_loss_per_epoch = [np.mean([f[e] for f in folds_val_losses]) for e in range(epochs)]
    epoch_min = np.array(mean_val_loss_per_epoch).argmin()  # 0-based best-epoch index
    # plot training results, skipping the first 10 noisy epochs
    epochs_range = range(11, epochs + 1)
    plt.plot(epochs_range, mean_loss_per_epoch[10:], 'r', label="Training loss")
    plt.plot(epochs_range, mean_val_loss_per_epoch[10:], 'b', label="Validation loss")
    plt.title("Training and validation loss (mse)")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()
    # output validation results
    print(f"\tBest epoch: {epoch_min}, (loss, validation loss) at best epoch: {mean_loss_per_epoch[epoch_min], mean_val_loss_per_epoch[epoch_min]}")
    # build and return final model; epoch_min is a 0-based index, so train for
    # epoch_min + 1 epochs (the original trained one epoch short of the best,
    # and did nothing at all when the best epoch was the first one)
    model = build_model(data.shape[1])
    model.fit(data, targets, epochs=epoch_min + 1, batch_size=batch_size, verbose=0)
    return model
%%time
dataset = frame.values
N_SPLITS = 20
TRAIN_SIZE = 0.66
mse = []
mse_lr = []
mse_ri = []
mse_xgb = []
for i, (train, test) in enumerate(sk.model_selection.ShuffleSplit(n_splits=N_SPLITS, train_size=TRAIN_SIZE, random_state=0).split(dataset)):
print(f"Processing split #{i}...")
# separate train and test data and targets
train_data = dataset[train,0:-1]
train_targets = dataset[train,-1]
test_data = dataset[test,0:-1]
test_targets = dataset[test,-1]
# normalize data
mean = train_data[:,range(-3, 0)].mean(axis=0)
train_data[:,range(-3, 0)] -= mean
test_data[:,range(-3, 0)] -= mean
std = train_data[:,range(-3, 0)].std(axis=0)
train_data[:,range(-3, 0)] /= std
test_data[:,range(-3, 0)] /= std
model = fit_DNN(train_data, train_targets, epochs=50, batch_size=16)
mse.append(sk.metrics.mean_squared_error(model.predict(test_data).squeeze(), test_targets))
print(f"MSE DNN (sample {i+1}): {mse[-1]}")
model_lr = sklm.LinearRegression().fit(train_data, train_targets)
mse_lr.append(sk.metrics.mean_squared_error(model_lr.predict(test_data), test_targets))
print(f"MSE Linear Regression (sample {i+1}): {mse_lr[-1]}")
model_ri = sklm.Ridge().fit(train_data, train_targets)
mse_ri.append(sk.metrics.mean_squared_error(model_ri.predict(test_data), test_targets))
print(f"MSE Ridge Regression (sample {i+1}): {mse_ri[-1]}")
model_xgb = xgb.XGBRegressor().fit(train_data, train_targets)
mse_xgb.append(sk.metrics.mean_squared_error(model_xgb.predict(test_data), test_targets))
print(f"MSE XGBoost (sample {i+1}): {mse_xgb[-1]}")
def output_results(difference, n_splits=None, train_size=None):
    """Print mean, SEM, 95% CI and p-values for a paired array of MSE differences.

    Reports both the naive statistics and the resampling-corrected versions of
    Nadeau & Bengio (2003). `n_splits` and `train_size` default to the
    module-level N_SPLITS / TRAIN_SIZE for backward compatibility.
    """
    n_splits = N_SPLITS if n_splits is None else n_splits
    train_size = TRAIN_SIZE if train_size is None else train_size
    mean_diff = np.mean(difference)
    sem = sp.stats.sem(difference)  # SEM = standard error of the mean = corrected sample std / sqrt(n)
    print(f"MSE difference - Mean: {mean_diff}, SEM: {sem}")
    # Pass the confidence level positionally: the `alpha` keyword was renamed
    # to `confidence` in SciPy 1.9 and later removed.
    print(f"CI 95%: {sp.stats.t.interval(0.95, df=n_splits-1, loc=mean_diff, scale=sem)}")
    print(f"p-value: {sp.stats.ttest_1samp(difference, popmean=0).pvalue}")
    # corrected SEM as per Nadeau and Bengio (2003). https://doi.org/10.1023/A:1024068626366
    corrected_sem = sem * np.sqrt(1 + n_splits * (1 - train_size) / train_size)
    print(f"corrected CI 95%: {sp.stats.t.interval(0.95, df=n_splits-1, loc=mean_diff, scale=corrected_sem)}")
    print(f"corrected p-value: {(1 - sp.stats.t.cdf(mean_diff / corrected_sem, df=n_splits-1)) * 2}")
# Paired comparisons: each baseline's MSE minus the DNN's, split by split
# (a positive mean difference means the DNN achieved a lower error).
dnn_mse = np.array(mse)
for pos, (label, baseline) in enumerate([
    ("Linear Regression", mse_lr),
    ("Ridge Regression", mse_ri),
    ("XGBoost", mse_xgb),
]):
    print(("" if pos == 0 else "\n") + f"Comparison with {label}")
    output_results(np.array(baseline) - dnn_mse)