The goal of this notebook is to be a reference (mostly for myself) of how different ML-libraries are used. The goal is not to dive deep into every library, but rather to show minimal examples of how to create a model, how to train it, and how to predict using the trained model. Since the focus is not on the model itself we will use linear regression to keep it simple.
When picking a library for a real problem, I would choose scikit-learn for most ML-models that are not deep neural networks. This is because it is a very high-level library, which means that we need to write few lines of code. I would choose either PyTorch or TensorFlow for most deep neural network models.
The source code can be found at https://github.com/CarlFredriksson/linear_regression_lib_comparison.
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import torch
import torch.nn as nn
import torch.optim as optim
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Record the exact library versions for reproducibility.
for label, module in [
    ('np', np),
    ('pd', pd),
    ('matplotlib', matplotlib),
    ('sklearn', sklearn),
    ('torch', torch),
    ('tf', tf),
]:
    print(f'{label}.__version__:', module.__version__)
We will use the diabetes dataset from sklearn.datasets, which you can read about in the scikit-learn documentation at https://scikit-learn.org/stable/datasets/index.html. To keep it as simple as possible we will only use one independent variable (the BMI of the patient which is the 3rd feature). Note from the documentation: Each of these 10 feature variables have been mean centered and scaled by the standard deviation times n_samples (i.e. the sum of squares of each column totals 1).
# Load the diabetes dataset and keep only the BMI feature (column 2) as a
# single-column design matrix; reshape the target into a column vector so
# both arrays are 2-D, as expected by the model APIs below.
x_diabetes, y_diabetes = datasets.load_diabetes(return_X_y=True)
print(x_diabetes.shape)
print(y_diabetes.shape)
x_diabetes = x_diabetes[:, 2].reshape(-1, 1)
x_diabetes.shape
y_diabetes = y_diabetes.reshape(-1, 1)
y_diabetes.shape
# Hold out a third of the samples for evaluation; fix the seed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x_diabetes, y_diabetes, test_size=1/3, random_state=42)
print(f'x_train.shape={x_train.shape} x_test.shape={x_test.shape} y_train.shape={y_train.shape} y_test.shape={y_test.shape}')
plt.scatter(x_train, y_train)
# scikit-learn: fit an ordinary least-squares model on the training split
# and evaluate it on the held-out test split.
sk_model = LinearRegression().fit(x_train, y_train)
y_pred = sk_model.predict(x_test)
y_pred.shape
print('Coefficients:', sk_model.coef_)
print('Mean squared error (MSE) on test set:', mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2) on test set:', r2_score(y_test, y_pred))
# Plot the test points together with the fitted regression line.
plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='red')
plt.show()
# PyTorch: a single Linear(1, 1) layer is exactly a one-feature linear
# regression; train it with full-batch gradient descent on MSE.
model = nn.Sequential(
    nn.Linear(1, 1)
)
loss_func = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.03)
x = torch.from_numpy(x_train.astype('float32'))
y = torch.from_numpy(y_train.astype('float32'))
for epoch in range(30000):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_func(y_pred, y)
    loss.backward()
    optimizer.step()
    if epoch % 1000 == 0:
        # .item() extracts the Python float instead of printing the
        # full tensor repr.
        print(f'epoch: {epoch}, loss: {loss.item()}')
# Inference: disable gradient tracking instead of using the deprecated
# .data attribute to strip the graph.
with torch.no_grad():
    y_pred = model(torch.from_numpy(x_test.astype('float32'))).numpy()
y_pred.shape
# model.parameters() yields [weight, bias]; everything but the last entry
# (the bias) is a weight matrix — here a single 1x1 kernel.
print('Coefficients:', [parameter.detach().numpy()[0][0] for parameter in list(model.parameters())[:-1]])
print('Mean squared error (MSE) on test set:', mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2) on test set:', r2_score(y_test, y_pred))
plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='red')
plt.show()
# Keras: a single Dense(1) layer on a one-dimensional input is again plain
# linear regression; compile with MSE loss and plain SGD.
model = keras.Sequential([
    layers.Dense(1, input_shape=[1])
])
model.compile(loss='mse', optimizer=tf.keras.optimizers.SGD(0.03), metrics=['mse'])
model.summary()
# Train silently and keep the per-epoch metric history for inspection.
history = model.fit(x_train, y_train, epochs=10000, verbose=0)
df_history = pd.DataFrame(history.history)
df_history['epoch'] = history.epoch
df_history.tail()
y_pred = model.predict(x_test)
y_pred.shape
# get_weights() returns [kernel, bias]; the kernel's first row holds the
# single learned coefficient.
weights, _bias = model.layers[0].get_weights()
print('Coefficients:', weights[0])
print('Mean squared error (MSE) on test set:', mean_squared_error(y_test, y_pred))
print('Coefficient of determination (R^2) on test set:', r2_score(y_test, y_pred))
plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='red')
plt.show()