
Machine Learning with TensorFlow

The basics of setting up TensorFlow, and an overview of a few machine learning problems: image classification, text classification and regression.

What is TensorFlow?

TensorFlow is an open-source library for building and training machine learning models.
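
As a minimal illustration of the core idea (assuming TensorFlow is already installed), everything is built around tensors, multi-dimensional arrays that TensorFlow can run computations over:

import tensorflow as tf

# tensors are multi-dimensional arrays, similar to NumPy arrays
x = tf.constant([[1.0, 2.0], [3.0, 4.0]])
print(x.shape)           # (2, 2)
print(tf.reduce_sum(x))  # tf.Tensor(10.0, shape=(), dtype=float32)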

Getting Started

Use the official TensorFlow installation guide to get set up.
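
In most environments the install is a single pip command; a minimal sketch, assuming a recent Python and no GPU-specific requirements:

# install the latest stable release
pip install tensorflow

# verify the installation
python -c "import tensorflow as tf; print(tf.__version__)"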

Machine Learning Concepts with Examples

Image Classification

The code below trains a neural network to classify images. The images in this example come from Fashion MNIST and let you train a model that distinguishes between different types of clothing (tops, trousers and so on).


import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

fashion_mnist = tf.keras.datasets.fashion_mnist
# train_images and train_labels arrays are the training set (model is tested against the test set)
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# explore the data: 60,000 training images and 10,000 test images, each 28x28 pixels
train_images.shape   # (60000, 28, 28)
len(train_labels)    # 60000
test_images.shape    # (10000, 28, 28)
len(test_labels)     # 10000

# preprocess the data
plt.figure()
plt.imshow(train_images[0])
plt.colorbar()
plt.grid(False)
plt.show()

# pixel values in the training set fall in the range 0 to 255
# scale them to a range of 0 to 1 before feeding them to the model, hence divide by 255
train_images = train_images/255.0
test_images = test_images/255.0

# display first 25 images from training set with class name of each image to verify the data is in the correct format
plt.figure(figsize=(10,10))
for i in range(25):
  plt.subplot(5,5,i+1)
  plt.xticks([])
  plt.yticks([])
  plt.grid(False)
  plt.imshow(train_images[i], cmap=plt.cm.binary)
  plt.xlabel(class_names[train_labels[i]])
plt.show()

# build model
# step 1: set up layers (layers extract representations from the data fed into them)
model = tf.keras.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28)), # transforms format of the image from two-dimensional array to one-dimensional
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10)
])

# before the model is ready to train, the compile step needs a loss function, an optimiser and metrics
model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# step 2: train the model
# feed the training data into the model
# model learns association between images and labels
# ask model to make predictions about a test set
# verify predictions match labels from test_labels array
model.fit(train_images, train_labels, epochs=10)
test_loss, test_acc = model.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc)
# attach a softmax layer to convert the model's logits to probabilities
probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])
predictions = probability_model.predict(test_images)
# graph to look at the full set of 10 class predictions
def plot_image(i, predictions_array, true_label, img):
  true_label, img = true_label[i], img[i]
  plt.grid(False)
  plt.xticks([])
  plt.yticks([])

  plt.imshow(img, cmap=plt.cm.binary)

  predicted_label = np.argmax(predictions_array)
  if predicted_label == true_label:
    color = 'blue'
  else:
    color = 'red'

  plt.xlabel("{}, {:2.0f}% ({})". format(class_names[predicted_label], 100*np.max(predictions_array). class_names[true_label]), color=color)

def plot_value_array(i, predictions_array, true_label):
  true_label = true_label[i]
  plt.grid(False)
  plt.xticks(range(10))
  plt.yticks([])
  thisplot = plt.bar(range(10), predictions_array, color='#777777')
  plt.ylim([0, 1])
  predicted_label = np.argmax(predictions_array)

  thisplot[predicted_label].set_color('red')
  thisplot[true_label].set_color('blue')

# verify predictions: plot several test images with their predicted and true labels
num_rows = 5
num_cols = 3
num_images = num_rows*num_cols
plt.figure(figsize=(2*2*num_cols, 2*num_rows))
for i in range(num_images):
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_image(i, predictions[i], test_labels, test_images)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_value_array(i, predictions[i], test_labels)
plt.tight_layout()
plt.show()

# use the trained model to make a prediction about a single image
img = test_images[1]
# tf.keras models are optimised to predict on a batch, so add the image to a batch where it is the only member
img = (np.expand_dims(img, 0))
predictions_single = probability_model.predict(img)

plot_value_array(1, predictions_single[0], test_labels)
_ = plt.xticks(range(10), class_names, rotation=45)
plt.show()
np.argmax(predictions_single[0])

Text Classification

This example trains a binary sentiment classifier on the IMDB dataset, a collection of plain-text movie reviews.

import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses

# download and extract the dataset
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file("aclImdb_v1", url, untar=True, cache_dir='.', cache_subdir='')
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

os.listdir(dataset_dir)
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

# load the dataset off disk and prepare into a format suitable for training
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# create a validation set using 80:20 split of the training data using the validation_split arg
batch_size = 32
seed = 42
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
  'aclImdb/train',
  batch_size=batch_size,
  validation_split=0.2,
  subset='training',
  seed=seed
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
  'aclImdb/train',
  batch_size=batch_size,
  validation_split=0.2,
  subset='validation',
  seed=seed
)
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
  'aclImdb/test',
  batch_size=batch_size
)

# prepare dataset for training
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ' )
  return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')

# create a TextVectorization layer
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
  standardize=custom_standardization,
  max_tokens=max_features,
  output_mode='int',
  output_sequence_length=sequence_length
)

# make a text-only dataset (without labels) and adapt the layer to build the vocabulary
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

# retrieve a batch from the dataset and inspect what vectorize_text produces
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

# configure the dataset for performance: .cache() keeps data in memory after it is loaded off disk and .prefetch() overlaps data preprocessing and model execution while training
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

# create model
embedding_dim = 16
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)
])
model.summary()

model.compile(loss=losses.BinaryCrossentropy(from_logits=True), optimizer='adam', metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

# train model
epochs = 10
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs)
loss, accuracy = model.evaluate(test_ds)
history_dict = history.history
history_dict.keys()

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(acc) + 1)
# plotting training loss
# 'bo' is for blue dot
plt.plot(epochs, loss, 'bo', label='Training loss')
# 'b' is for solid blue line
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
# plotting training and validation accuracy
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

# export a model that includes the TextVectorization layer, so it can process raw strings directly
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])
export_model.compile(
  loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)
# test with raw_test_ds
loss, accuracy = export_model.evaluate(raw_test_ds)
print(accuracy)
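
Because the exported model contains the TextVectorization layer, it can score new raw-text reviews directly. A minimal usage sketch (the review sentences here are made up for illustration):

examples = [
  "The movie was great!",
  "The movie was okay.",
  "The movie was terrible..."
]
export_model.predict(examples)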

Regression

Regression problems predict the output of a continuous value, like a price or a probability. The aim of a classification problem, by contrast, is to select a class from a list of classes.
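
One way to see the difference in code: the Fashion MNIST classifier above ends in a Dense layer with one unit per class, while the regression models below end in a single linear unit. A minimal sketch (layer sizes are illustrative):

import tensorflow as tf

# classification head: one output unit per class (scores/logits over the classes)
classification_head = tf.keras.layers.Dense(10)
# regression head: a single unit whose output is the predicted continuous value
regression_head = tf.keras.layers.Dense(1)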

This example uses the Auto MPG dataset.

pip install -q seaborn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)

dataset = raw_dataset.copy()
dataset.tail()
dataset.isna().sum()
dataset = dataset.dropna()
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='')
dataset.tail()

train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

# inspect data
sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']], diag_kind='kde')

# split features from labels
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
test_labels = test_features.pop('MPG')

# normalisation
train_dataset.describe().transpose()[['mean', 'std']]
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))
print(normalizer.mean.numpy())
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
  print('First example:', first)
  print()
  print('Normalized:', normalizer(first).numpy())

# linear regression (one variable)
horsepower = np.array(train_features['Horsepower'])

horsepower_normalizer = layers.Normalization(input_shape=[1,], axis=None)
horsepower_normalizer.adapt(horsepower)

horsepower_model = tf.keras.Sequential([
  horsepower_normalizer,
  layers.Dense(units=1)
])
horsepower_model.summary()
horsepower_model.predict(horsepower[:10])
horsepower_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
  loss='mean_absolute_error'
)
%%time
history = horsepower_model.fit(
  train_features['Horsepower'],
  train_labels,
  epochs=100,
  verbose=0,
  validation_split = 0.2
)
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

def plot_loss(history):
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  plt.ylim([0, 10])
  plt.xlabel('Epoch')
  plt.ylabel('Error [MPG]')
  plt.legend()
  plt.grid(True)
plot_loss(history)

test_results = {}
test_results['horsepower_model'] = horsepower_model.evaluate(
  test_features['Horsepower'],
  test_labels, verbose=0
)
x = tf.linspace(0.0, 250, 251)
y = horsepower_model.predict(x)

def plot_horsepower(x, y):
  plt.scatter(train_features['Horsepower'], train_labels, label='Data')
  plt.plot(x, y, color='k', label='Predictions')
  plt.xlabel('Horsepower')
  plt.ylabel('MPG')
  plt.legend()
plot_horsepower(x, y)