There were numerous columns with data I didn't need, so I had to drop these. The data I was interested in was all in one column, so the first x rows had data on average earnings, the next x rows had data on average earnings for males, the next x rows for females, the next x rows had data on years spent in education and so on. To test for linear correlation I needed the data in adjacent columns not all in one column. The following script was used to reshape the data. I extracted the data I needed into separate dataframes then combined these dataframes as necessary:

import pandas as pd

import matplotlib.pyplot as plt

from matplotlib import cm as cm

def plot_corr(df, size=4):
    """Render a greyscale correlation-matrix heatmap for a dataframe.

    Input:
    df: pandas DataFrame
    size: vertical and horizontal size of the plot (inches)
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr, cmap=cm.Greys)
    # label both axes with the column names, one tick per column
    ticks = range(len(corr.columns))
    plt.xticks(ticks, corr.columns)
    plt.yticks(ticks, corr.columns)
    plt.show()

#import data
df = pd.read_csv('oecd.csv')

#features available
'''
print(df['Indicator'].unique())
['Dwellings without basic facilities' 'Housing expenditure'
'Rooms per person' 'Household net adjusted disposable income'
'Household net financial wealth' 'Employment rate'
'Long-term unemployment rate' 'Personal earnings'
'Quality of support network' 'Educational attainment' 'Student skills'
'Years in education' 'Air pollution' 'Water quality' 'Voter turnout'
'Life expectancy' 'Self-reported health' 'Life satisfaction'
'Homicide rate' 'Employees working very long hours'
'Time devoted to leisure and personal care' 'Labour market insecurity'
'Stakeholder engagement for developing regulations'
'Feeling safe walking alone at night']
'''

#format data
gender = 'Total'  # there are 3 values: Total, Male and Female

#one dataframe per indicator of interest, restricted to the chosen gender
df_edu = df[(df['Indicator']=='Years in education') & (df['Inequality']==gender)]
df_earn = df[(df['Indicator']=='Personal earnings') & (df['Inequality']==gender)]
df_satis = df[(df['Indicator']=='Life satisfaction') & (df['Inequality']==gender)]
df_person = df[(df['Indicator']=='Time devoted to leisure and personal care') & (df['Inequality']==gender)]

#columns we have no use for; only Country and Value are kept
drop = ['LOCATION','INDICATOR','Indicator','MEASURE','Measure','INEQUALITY','Inequality','Unit Code','Unit','PowerCode Code','PowerCode','Reference Period Code', 'Reference Period', 'Flag Codes', 'Flags']

#drop(cols, 1) used the deprecated positional axis argument (removed in
#pandas 2.0); the explicit columns= keyword is equivalent and future-proof
df_edu = df_edu.drop(columns=drop).rename(columns={'Value': 'Edu'})
df_earn = df_earn.drop(columns=drop).rename(columns={'Value': 'Earn'})
df_satis = df_satis.drop(columns=drop).rename(columns={'Value': 'Satis'})
df_person = df_person.drop(columns=drop).rename(columns={'Value': 'Person'})

#combine the indicators side by side, one row per country
df_combine = df_edu.merge(df_earn, on='Country', how='left')
df_combine = df_combine.merge(df_satis, on='Country', how='left')
df_combine = df_combine.merge(df_person, on='Country', how='left')
df_combine = df_combine.drop(columns=['Country'])

#print(df_combine.corr(method='spearman'))
#print(df_combine.head())

#visualise
#plot_corr(df_combine)
df_combine.plot(kind='scatter', y='Satis', x='Earn')
plt.show()

import pandas as pd

import matplotlib.pyplot as plt

from matplotlib import cm as cm

def plot_corr(df, size=4):
    """Render a greyscale correlation-matrix heatmap for a dataframe.

    Input:
    df: pandas DataFrame
    size: vertical and horizontal size of the plot (inches)
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr, cmap=cm.Greys)
    # label both axes with the column names, one tick per column
    ticks = range(len(corr.columns))
    plt.xticks(ticks, corr.columns)
    plt.yticks(ticks, corr.columns)
    plt.show()

#import data
df = pd.read_csv('oecd.csv')

#features available
'''
print(df['Indicator'].unique())
['Dwellings without basic facilities' 'Housing expenditure'
'Rooms per person' 'Household net adjusted disposable income'
'Household net financial wealth' 'Employment rate'
'Long-term unemployment rate' 'Personal earnings'
'Quality of support network' 'Educational attainment' 'Student skills'
'Years in education' 'Air pollution' 'Water quality' 'Voter turnout'
'Life expectancy' 'Self-reported health' 'Life satisfaction'
'Homicide rate' 'Employees working very long hours'
'Time devoted to leisure and personal care' 'Labour market insecurity'
'Stakeholder engagement for developing regulations'
'Feeling safe walking alone at night']
'''

#format data
gender = 'Total'  # there are 3 values: Total, Male and Female

#one dataframe per indicator of interest, restricted to the chosen gender
df_edu = df[(df['Indicator']=='Years in education') & (df['Inequality']==gender)]
df_earn = df[(df['Indicator']=='Personal earnings') & (df['Inequality']==gender)]
df_satis = df[(df['Indicator']=='Life satisfaction') & (df['Inequality']==gender)]
df_person = df[(df['Indicator']=='Time devoted to leisure and personal care') & (df['Inequality']==gender)]

#columns we have no use for; only Country and Value are kept
drop = ['LOCATION','INDICATOR','Indicator','MEASURE','Measure','INEQUALITY','Inequality','Unit Code','Unit','PowerCode Code','PowerCode','Reference Period Code', 'Reference Period', 'Flag Codes', 'Flags']

#drop(cols, 1) used the deprecated positional axis argument (removed in
#pandas 2.0); the explicit columns= keyword is equivalent and future-proof
df_edu = df_edu.drop(columns=drop).rename(columns={'Value': 'Edu'})
df_earn = df_earn.drop(columns=drop).rename(columns={'Value': 'Earn'})
df_satis = df_satis.drop(columns=drop).rename(columns={'Value': 'Satis'})
df_person = df_person.drop(columns=drop).rename(columns={'Value': 'Person'})

#combine the indicators side by side, one row per country
df_combine = df_edu.merge(df_earn, on='Country', how='left')
df_combine = df_combine.merge(df_satis, on='Country', how='left')
df_combine = df_combine.merge(df_person, on='Country', how='left')
df_combine = df_combine.drop(columns=['Country'])

#print(df_combine.corr(method='spearman'))
#print(df_combine.head())

#visualise
#plot_corr(df_combine)
df_combine.plot(kind='scatter', y='Satis', x='Earn')
plt.show()

The darker the shade of grey the stronger the correlation. The top left to bottom right diagonal can be ignored, this is comparing the same fields so equals 1.

Satis = life satisfaction

earn = personal earnings

person = available personal time (to pursue hobbies, relax and so on)

Edu = time spent in full time education

import pandas as pd

import matplotlib.pyplot as plt

from matplotlib import cm as cm

def plot_corr(df, size=4):
    """Render a greyscale correlation-matrix heatmap for a dataframe.

    Input:
    df: pandas DataFrame
    size: vertical and horizontal size of the plot (inches)
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr, cmap=cm.Greys)
    # label both axes with the column names, one tick per column
    ticks = range(len(corr.columns))
    plt.xticks(ticks, corr.columns)
    plt.yticks(ticks, corr.columns)
    plt.show()

#import data
df = pd.read_csv('oecd.csv')

#features available
'''
print(df['Indicator'].unique())
['Dwellings without basic facilities' 'Housing expenditure'
'Rooms per person' 'Household net adjusted disposable income'
'Household net financial wealth' 'Employment rate'
'Long-term unemployment rate' 'Personal earnings'
'Quality of support network' 'Educational attainment' 'Student skills'
'Years in education' 'Air pollution' 'Water quality' 'Voter turnout'
'Life expectancy' 'Self-reported health' 'Life satisfaction'
'Homicide rate' 'Employees working very long hours'
'Time devoted to leisure and personal care' 'Labour market insecurity'
'Stakeholder engagement for developing regulations'
'Feeling safe walking alone at night']
'''

#format data
gender = 'Total'  # there are 3 values: Total, Male and Female

#one dataframe per indicator of interest, restricted to the chosen gender
df_edu = df[(df['Indicator']=='Years in education') & (df['Inequality']==gender)]
df_earn = df[(df['Indicator']=='Personal earnings') & (df['Inequality']==gender)]
df_satis = df[(df['Indicator']=='Life satisfaction') & (df['Inequality']==gender)]
df_person = df[(df['Indicator']=='Time devoted to leisure and personal care') & (df['Inequality']==gender)]

#columns we have no use for; only Country and Value are kept
drop = ['LOCATION','INDICATOR','Indicator','MEASURE','Measure','INEQUALITY','Inequality','Unit Code','Unit','PowerCode Code','PowerCode','Reference Period Code', 'Reference Period', 'Flag Codes', 'Flags']

#drop(cols, 1) used the deprecated positional axis argument (removed in
#pandas 2.0); the explicit columns= keyword is equivalent and future-proof
df_edu = df_edu.drop(columns=drop).rename(columns={'Value': 'Edu'})
df_earn = df_earn.drop(columns=drop).rename(columns={'Value': 'Earn'})
df_satis = df_satis.drop(columns=drop).rename(columns={'Value': 'Satis'})
df_person = df_person.drop(columns=drop).rename(columns={'Value': 'Person'})

#combine the indicators side by side, one row per country
df_combine = df_edu.merge(df_earn, on='Country', how='left')
df_combine = df_combine.merge(df_satis, on='Country', how='left')
df_combine = df_combine.merge(df_person, on='Country', how='left')
df_combine = df_combine.drop(columns=['Country'])

#print(df_combine.corr(method='spearman'))
#print(df_combine.head())

#visualise
#plot_corr(df_combine)
df_combine.plot(kind='scatter', y='Satis', x='Earn')
plt.show()

Satis = life satisfaction

earn = personal earnings

person = available personal time (to pursue hobbies, relax and so on)

Edu = time spent in full time education

import pandas as pd

import matplotlib.pyplot as plt

from matplotlib import cm as cm

def plot_corr(df, size=4):
    """Render a greyscale correlation-matrix heatmap for a dataframe.

    Input:
    df: pandas DataFrame
    size: vertical and horizontal size of the plot (inches)
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr, cmap=cm.Greys)
    # label both axes with the column names, one tick per column
    ticks = range(len(corr.columns))
    plt.xticks(ticks, corr.columns)
    plt.yticks(ticks, corr.columns)
    plt.show()

#import data
df = pd.read_csv('oecd.csv')

#features available
'''
print(df['Indicator'].unique())
['Dwellings without basic facilities' 'Housing expenditure'
'Rooms per person' 'Household net adjusted disposable income'
'Household net financial wealth' 'Employment rate'
'Long-term unemployment rate' 'Personal earnings'
'Quality of support network' 'Educational attainment' 'Student skills'
'Years in education' 'Air pollution' 'Water quality' 'Voter turnout'
'Life expectancy' 'Self-reported health' 'Life satisfaction'
'Homicide rate' 'Employees working very long hours'
'Time devoted to leisure and personal care' 'Labour market insecurity'
'Stakeholder engagement for developing regulations'
'Feeling safe walking alone at night']
'''

#format data
gender = 'Total'  # there are 3 values: Total, Male and Female

#one dataframe per indicator of interest, restricted to the chosen gender
df_edu = df[(df['Indicator']=='Years in education') & (df['Inequality']==gender)]
df_earn = df[(df['Indicator']=='Personal earnings') & (df['Inequality']==gender)]
df_satis = df[(df['Indicator']=='Life satisfaction') & (df['Inequality']==gender)]
df_person = df[(df['Indicator']=='Time devoted to leisure and personal care') & (df['Inequality']==gender)]

#columns we have no use for; only Country and Value are kept
drop = ['LOCATION','INDICATOR','Indicator','MEASURE','Measure','INEQUALITY','Inequality','Unit Code','Unit','PowerCode Code','PowerCode','Reference Period Code', 'Reference Period', 'Flag Codes', 'Flags']

#drop(cols, 1) used the deprecated positional axis argument (removed in
#pandas 2.0); the explicit columns= keyword is equivalent and future-proof
df_edu = df_edu.drop(columns=drop).rename(columns={'Value': 'Edu'})
df_earn = df_earn.drop(columns=drop).rename(columns={'Value': 'Earn'})
df_satis = df_satis.drop(columns=drop).rename(columns={'Value': 'Satis'})
df_person = df_person.drop(columns=drop).rename(columns={'Value': 'Person'})

#combine the indicators side by side, one row per country
df_combine = df_edu.merge(df_earn, on='Country', how='left')
df_combine = df_combine.merge(df_satis, on='Country', how='left')
df_combine = df_combine.merge(df_person, on='Country', how='left')
df_combine = df_combine.drop(columns=['Country'])

#print(df_combine.corr(method='spearman'))
#print(df_combine.head())

#visualise
#plot_corr(df_combine)
df_combine.plot(kind='scatter', y='Satis', x='Earn')
plt.show()

We are still a long way from the AI of science fiction stories which rises-up and builds an army of human hating killing machines. The two most widespread forms of AI we have are machine learning (ML) and deep learning (DL). Both are examples of narrow AI that is they may be able outperform humans in one narrow task but unlike the multi functional human brain they are unable to carry out unrelated tasks, to reason or be self-aware.

Our world is increasingly controlled by Artificial Intelligence (AI), and it is not just the self driving cars and chess playing computers that the media often report on. Google, Amazon, Facebook, your bank, the government, security/intelligence organizations, the police and others all use AI. Often this is Machine Learning (ML) but increasingly Deep Learning (DL) is being used to solve a diverse range of problems including medical issues, Amazon’s Echo and business applications.

But what is it? Is it just another form of machine learning? Though they have some things in common DL and ML are not the same.

To illustrate how DL works I will use a basic DL script written in Python, the script is available below. The problem we are going to attempt to solve involves creating a model which divides data into categories. This type of problem is common in the real world, for example customers who renew their subscription/contract versus those who don’t, computer activity which is suspicious or benign, a fuzzy patch on an MRI scan that might be benign or might be cancer. The script can generate three types of dataset. The first, called moons, can be plotted on a 2 dimensional graph:

The two classes are coloured red and blue. Our model will be represented by a decision boundary, a line, on the graph which separates the blue and red dots. For any new data point we will then be able to use our model to determine if it belongs to the reds or the blues. As you can see a simple straight line will not be sufficient to solve the problem:

Feeding the data into the DL script generates the following solution:

The solution is not perfect, but it is pretty good. Note that we are not trying to get every single point on the correct side of the line – that situation is called over fitting, it is not a good thing. Real world data often contains outliers and noise, we don’t necessarily want to include this in our model.

**The Script**

import numpy as np

import sklearn

from sklearn import datasets, linear_model

import matplotlib.pyplot as plt

def initialise(a, b, c, d):
    """Bundle the network hyper-parameters into a tuple.

    a: number of input nodes
    b: number of output nodes
    c: learning rate for gradient descent
    d: regularization strength
    """
    # the callers assign these four values to module-level globals
    return a, b, c, d

def generate_data(data_type):
    """Create a 2-D toy classification dataset.

    data_type: one of 'moons', 'circles' or 'blobs'
    Returns (X, y): a (n, 2) feature array and a label vector.
    Raises ValueError for an unrecognised data_type (the original fell
    through and crashed with an UnboundLocalError on the return line).
    """
    np.random.seed(0)  # make the sampled noise reproducible
    if data_type == 'moons':
        X, y = datasets.make_moons(200, noise=0.20)
    elif data_type == 'circles':
        X, y = datasets.make_circles(200, noise=0.20)
    elif data_type == 'blobs':
        X, y = datasets.make_blobs(centers=2, random_state=0)
    else:
        raise ValueError("data_type must be 'moons', 'circles' or 'blobs'")
    return X, y

def visualize(X, y, model):
    """Plot the dataset together with the model's decision boundary."""
    def classify(points):
        return predict(model, points)
    plot_decision_boundary(classify, X, y)

def plot_decision_boundary(pred_func, X, y):
    """Shade the 2-D plane by predicted class and overlay the samples.

    pred_func: callable mapping an (n, 2) array of points to class labels
    X, y: the training samples and their labels
    """
    pad = .5     # margin around the data
    step = 0.01  # grid resolution
    x_lo, x_hi = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_lo, y_hi = X[:, 1].min() - pad, X[:, 1].max() + pad
    # grid of points covering the data, `step` apart
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))
    # predict a class for every grid point, then shape back onto the grid
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = pred_func(grid_points).reshape(xx.shape)
    # filled contour = decision regions; scatter = training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    plt.show()

# Helper function to evaluate the total loss on the dataset
def calculate_loss(model, X, y):
    """Average cross-entropy loss, plus an L2 penalty, over the dataset.

    Relies on the module-level `reg_lambda` regularisation strength.
    """
    n = len(X)  # training set size
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # forward pass: tanh hidden layer followed by a softmax output
    hidden = np.tanh(X.dot(W1) + b1)
    scores = np.exp(hidden.dot(W2) + b2)
    probs = scores / np.sum(scores, axis=1, keepdims=True)
    # negative log-likelihood of the true class of each example
    data_loss = np.sum(-np.log(probs[range(n), y]))
    # L2 regularisation on the weight matrices (biases excluded)
    data_loss += reg_lambda / 2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1. / n * data_loss

def predict(model, x):
    """Return the most probable class index for each row of x.

    model: dict with weight matrices 'W1', 'W2' and biases 'b1', 'b2'
    x: (n, nn_input_dim) array of samples
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # forward pass: tanh hidden layer then softmax output
    hidden = np.tanh(x.dot(W1) + b1)
    scores = np.exp(hidden.dot(W2) + b2)
    probs = scores / np.sum(scores, axis=1, keepdims=True)
    # the predicted class is the column with the highest probability
    return np.argmax(probs, axis=1)

# This function learns parameters for the neural network and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - print_loss: If True, print the loss every 1000 iterations
# Reads the module-level globals nn_input_dim, nn_output_dim, epsilon and
# reg_lambda, which are set by initialise().
def build_model(X, y, nn_hdim, num_passes=20000, print_loss=False):
    # Initialize the parameters to random values. We need to learn these.
    num_examples = len(X)
    np.random.seed(0)  # deterministic initial weights
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
    model = {}

    # Gradient descent. For each batch...
    for i in range(0, num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)  # hidden-layer activation
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)
        # softmax: per-row class probabilities
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Backpropagation
        delta3 = probs
        # gradient of cross-entropy w.r.t. the softmax input
        delta3[range(num_examples), y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        # tanh derivative is 1 - tanh^2
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, calculate_loss(model, X, y)))

    return model

# hyper-parameters: 2 input nodes, 2 output nodes, learning rate 0.01,
# regularisation strength 0.01 (read as globals by the functions above)
nn_input_dim, nn_output_dim, epsilon, reg_lambda = initialise(2,2,0.01,0.01)
# train a 4-hidden-node model on the 'moons' dataset and plot its boundary
X, y = generate_data('moons')
model = build_model(X, y, 4, 10000, print_loss=True)
visualize(X, y, model)

Note: this script is derived from:

https://github.com/dennybritz/nn-from-scratch/blob/master/ann_classification.py

import numpy as np
import sklearn

from sklearn import datasets, linear_model

import matplotlib.pyplot as plt

def initialise(a, b, c, d):
    """Bundle the network hyper-parameters into a tuple.

    a: number of input nodes
    b: number of output nodes
    c: learning rate for gradient descent
    d: regularization strength
    """
    # the callers assign these four values to module-level globals
    return a, b, c, d

def generate_data(data_type):
    """Create a 2-D toy classification dataset.

    data_type: one of 'moons', 'circles' or 'blobs'
    Returns (X, y): a (n, 2) feature array and a label vector.
    Raises ValueError for an unrecognised data_type (the original fell
    through and crashed with an UnboundLocalError on the return line).
    """
    np.random.seed(0)  # make the sampled noise reproducible
    if data_type == 'moons':
        X, y = datasets.make_moons(200, noise=0.20)
    elif data_type == 'circles':
        X, y = datasets.make_circles(200, noise=0.20)
    elif data_type == 'blobs':
        X, y = datasets.make_blobs(centers=2, random_state=0)
    else:
        raise ValueError("data_type must be 'moons', 'circles' or 'blobs'")
    return X, y

def visualize(X, y, model):
    """Plot the dataset together with the model's decision boundary."""
    def classify(points):
        return predict(model, points)
    plot_decision_boundary(classify, X, y)

def plot_decision_boundary(pred_func, X, y):
    """Shade the 2-D plane by predicted class and overlay the samples.

    pred_func: callable mapping an (n, 2) array of points to class labels
    X, y: the training samples and their labels
    """
    pad = .5     # margin around the data
    step = 0.01  # grid resolution
    x_lo, x_hi = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_lo, y_hi = X[:, 1].min() - pad, X[:, 1].max() + pad
    # grid of points covering the data, `step` apart
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))
    # predict a class for every grid point, then shape back onto the grid
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = pred_func(grid_points).reshape(xx.shape)
    # filled contour = decision regions; scatter = training examples
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    plt.show()

# Helper function to evaluate the total loss on the dataset
def calculate_loss(model, X, y):
    """Average cross-entropy loss, plus an L2 penalty, over the dataset.

    Relies on the module-level `reg_lambda` regularisation strength.
    """
    n = len(X)  # training set size
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # forward pass: tanh hidden layer followed by a softmax output
    hidden = np.tanh(X.dot(W1) + b1)
    scores = np.exp(hidden.dot(W2) + b2)
    probs = scores / np.sum(scores, axis=1, keepdims=True)
    # negative log-likelihood of the true class of each example
    data_loss = np.sum(-np.log(probs[range(n), y]))
    # L2 regularisation on the weight matrices (biases excluded)
    data_loss += reg_lambda / 2 * (np.sum(np.square(W1)) + np.sum(np.square(W2)))
    return 1. / n * data_loss

def predict(model, x):
    """Return the most probable class index for each row of x.

    model: dict with weight matrices 'W1', 'W2' and biases 'b1', 'b2'
    x: (n, nn_input_dim) array of samples
    """
    W1, b1, W2, b2 = model['W1'], model['b1'], model['W2'], model['b2']
    # forward pass: tanh hidden layer then softmax output
    hidden = np.tanh(x.dot(W1) + b1)
    scores = np.exp(hidden.dot(W2) + b2)
    probs = scores / np.sum(scores, axis=1, keepdims=True)
    # the predicted class is the column with the highest probability
    return np.argmax(probs, axis=1)

# This function learns parameters for the neural network and returns the model.
# - nn_hdim: Number of nodes in the hidden layer
# - num_passes: Number of passes through the training data for gradient descent
# - print_loss: If True, print the loss every 1000 iterations
# Reads the module-level globals nn_input_dim, nn_output_dim, epsilon and
# reg_lambda, which are set by initialise().
def build_model(X, y, nn_hdim, num_passes=20000, print_loss=False):
    # Initialize the parameters to random values. We need to learn these.
    num_examples = len(X)
    np.random.seed(0)  # deterministic initial weights
    W1 = np.random.randn(nn_input_dim, nn_hdim) / np.sqrt(nn_input_dim)
    b1 = np.zeros((1, nn_hdim))
    W2 = np.random.randn(nn_hdim, nn_output_dim) / np.sqrt(nn_hdim)
    b2 = np.zeros((1, nn_output_dim))

    # This is what we return at the end
    model = {}

    # Gradient descent. For each batch...
    for i in range(0, num_passes):
        # Forward propagation
        z1 = X.dot(W1) + b1
        a1 = np.tanh(z1)  # hidden-layer activation
        z2 = a1.dot(W2) + b2
        exp_scores = np.exp(z2)
        # softmax: per-row class probabilities
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

        # Backpropagation
        delta3 = probs
        # gradient of cross-entropy w.r.t. the softmax input
        delta3[range(num_examples), y] -= 1
        dW2 = (a1.T).dot(delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)
        # tanh derivative is 1 - tanh^2
        delta2 = delta3.dot(W2.T) * (1 - np.power(a1, 2))
        dW1 = np.dot(X.T, delta2)
        db1 = np.sum(delta2, axis=0)

        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Gradient descent parameter update
        W1 += -epsilon * dW1
        b1 += -epsilon * db1
        W2 += -epsilon * dW2
        b2 += -epsilon * db2

        # Assign new parameters to the model
        model = {'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

        # Optionally print the loss.
        # This is expensive because it uses the whole dataset, so we don't want to do it too often.
        if print_loss and i % 1000 == 0:
            print("Loss after iteration %i: %f" % (i, calculate_loss(model, X, y)))

    return model

# hyper-parameters: 2 input nodes, 2 output nodes, learning rate 0.01,
# regularisation strength 0.01 (read as globals by the functions above)
nn_input_dim, nn_output_dim, epsilon, reg_lambda = initialise(2,2,0.01,0.01)
# train a 4-hidden-node model on the 'moons' dataset and plot its boundary
X, y = generate_data('moons')
model = build_model(X, y, 4, 10000, print_loss=True)
visualize(X, y, model)

Note: this script is derived from:

https://github.com/dennybritz/nn-from-scratch/blob/master/ann_classification.py

import pandas as pd

import plotly.offline as py

import numpy as np

df = pd.read_csv('VietnamConflict.csv')
df_states = pd.read_csv('states.csv') #approx 1967 population by state

#casualties per state: one row per casualty, counted by state code
death_by_state = df['STATE_CODE'].value_counts().reset_index().rename(columns={'index': 'STATE_CODE', 'STATE_CODE': 'COUNT'})
df_normalised = pd.merge(df_states, death_by_state, on='STATE_CODE', how='left')

#casualties per 100,000 residents; the original rounded the raw per-capita
#ratio (well below 0.5 for every state), which truncated the whole column
#to zero
df_normalised['NORMALISED_COUNT'] = (df_normalised['COUNT'] / df_normalised['POPULATION'] * 100000).round()

#plotly expects string values
for col in df_normalised.columns:
    df_normalised[col] = df_normalised[col].astype(str)

#purple colour scale, light to dark
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]

labels = df_normalised['STATE_CODE']
values = df_normalised['COUNT']

data = [dict(
    type='choropleth',
    colorscale=scl,
    autocolorscale=False,
    locations=labels,
    z=np.array(values).astype(float),
    locationmode='USA-states',
    text=labels,
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',
            width=2
        )),
    colorbar=dict(
        title="US casualties")
)]

layout = dict(
    #title = 'US casualties in Vietnam war<br>(Normalised by approximate 1967 state pop)',
    title='US casualties in Vietnam war',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'),
)

fig = dict(data=data, layout=layout)
py.plot(fig, filename='US_Vietnam_war_casualties.html')


Our goal is to write code that will take some details on a passenger and predict if he/she died or survived.

Machine learning attempts to build a data model based on features of the data, for example did the passenger have a first, second or third class ticket, was the passenger male or female and so on. The model can then be used to predict the outcome for a given passenger or group of passengers. This dataset serves to illustrate some of the features of machine learning, but remember the code could with very little alteration also handle other data for example predicting if someone will develop diabetes.

The first thing we need is a labelled training data set. The algorithm learns from this data then applies what it learnt to new data (test data). This problem is a classification problem as every passenger fell into one of two categories - they survived or they died so the 3 machine learning techniques used below are classification techniques, they are also supervised learning techniques which means we need to break the original data set into a training dataset and a test dataset

#the script was written and tested using idle, it should be compatible with both python 2 and 3

#import the necessary libraries

import pandas as pd

from sklearn import tree, preprocessing

import sklearn.ensemble as ske

from sklearn.model_selection import train_test_split

#read the data into a pandas dataframe

#read the data into a pandas dataframe
df = pd.read_csv('titanic_data.csv')

#drop fields which have lots of missing data, then drop rows with missing data
df = df.drop(['body','cabin','boat','home.dest','name','ticket'],axis=1)
df = df.dropna()

#machine learning needs numerical values not strings, so label-encode the
#two remaining categorical columns
le = preprocessing.LabelEncoder()
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)

'''
a row from the original data looked like:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
1 1 Allen, Miss. Elisabeth Walton female 29 0 0 24160 211.3375 B5 S 2 St Louis, MO
a typical row now looks like:
pclass survived sex age sibsp parch fare embarked
1 1 0 29.0000 0 0 211.3375 2
'''

#X: the feature matrix without the label column, y: the survived labels
X = df.drop(['survived'], axis=1).values
y = df['survived'].values

#supervised learning needs separate training and test data; test_size sets
#the relative sizes of the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0)

#fit three different classifiers and print each success rate (1 = 100%);
#vary the parameters below, or test_size above, to try to improve the score
classifiers = [
    tree.DecisionTreeClassifier(max_depth=10),
    ske.RandomForestClassifier(n_estimators=50),
    ske.GradientBoostingClassifier(n_estimators=50),
]
for clf in classifiers:
    clf = clf.fit(X_train, y_train)
    print(clf.score(X_test,y_test))

'''
I found the gradient boosting technique gave the best results (about 82% accuracy) and the decision
tree gave the worst results
'''

#the script was written and tested using idle, it should be compatible with both python 2 and 3

#import the necessary libraries

import pandas as pd

from sklearn import tree, preprocessing

import sklearn.ensemble as ske

from sklearn.model_selection import train_test_split

#read the data into a pandas dataframe

#read the data into a pandas dataframe
df = pd.read_csv('titanic_data.csv')

#drop fields which have lots of missing data, then drop rows with missing data
df = df.drop(['body','cabin','boat','home.dest','name','ticket'],axis=1)
df = df.dropna()

#machine learning needs numerical values not strings, so label-encode the
#two remaining categorical columns
le = preprocessing.LabelEncoder()
df.sex = le.fit_transform(df.sex)
df.embarked = le.fit_transform(df.embarked)

'''
a row from the original data looked like:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
1 1 Allen, Miss. Elisabeth Walton female 29 0 0 24160 211.3375 B5 S 2 St Louis, MO
a typical row now looks like:
pclass survived sex age sibsp parch fare embarked
1 1 0 29.0000 0 0 211.3375 2
'''

#X: the feature matrix without the label column, y: the survived labels
X = df.drop(['survived'], axis=1).values
y = df['survived'].values

#supervised learning needs separate training and test data; test_size sets
#the relative sizes of the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=0)

#fit three different classifiers and print each success rate (1 = 100%);
#vary the parameters below, or test_size above, to try to improve the score
classifiers = [
    tree.DecisionTreeClassifier(max_depth=10),
    ske.RandomForestClassifier(n_estimators=50),
    ske.GradientBoostingClassifier(n_estimators=50),
]
for clf in classifiers:
    clf = clf.fit(X_train, y_train)
    print(clf.score(X_test,y_test))

'''
I found the gradient boosting technique gave the best results (about 82% accuracy) and the decision
tree gave the worst results
'''

Mon | Tues | Wed | Thur | Fri | Sat | Sun |

243 | 84 | 74 | 51 | 55 | 60 | 56 |

31 | 34 | 33 | 33 | 88 | 48 | 58 |

69 | 43 | 48 | 86 | 60 | 83 | 77 |

100 | 82 | 86 | 93 | 86 | 96 | 62 |

44 | 72 | 81 | 29 | 20 | 30 | 28 |

37 | 33 | 24 | 26 | 35 | 27 | 95 |

185 | 111 | 145 | 136 | 162 | 153 | 175 |

132 | 261 | 206 | 336 | 183 | 171 | 158 |

150 | 160 | 402 | 90 | 69 | 143 | 88 |

56 | 71 | 47 | 47 | 42 | 324 | 40 |

41 | 36 | 47 | 76 | 164 | 65 | 44 |

The amount of data is small but it is still difficult to take in everything when it is presented as a table. But if we plot the data as a line graph with day number on the x-axis and number of customers on the y-axis:

Straight away we can see that we can divide the data into pre-40 days and post-40 days. With much more activity including some prominent spikes in the post-40 day data. Plotting the data makes the story much easier to see. We can't say why there is a difference, there is nothing in the data we have that could answer that question. A more technical plot is the box and whisker plot. It is less user friendly than a simple line graph but does give more information on the median, quartile and range of the data.

The diagram below explains the different features in the above plot. Note the above plot also contains dots which indicate outliers in the data. Each box represents a day of the week starting at Monday.

I would never use a box and whisker plot in a report or presentation aimed at people who are not familiar with statistics.

One more possible plot is the waterfall graph:

One more possible plot is the waterfall graph:

This plot starts at the beginning of week three. Each bar represents the increase or decrease in number of customers from the previous day. It is similar to the line graph above but can be used to highlight certain events/days. For example say one day we are expecting customer numbers to increase but instead we see a decline - the waterfall graph can illustrate this clearly:

Other common graphs include the bar graph and the histogram, see here for a tutorial on these.

Fraternal twins can be mm, mf, fm or ff (where m = male and f = female), identical twins can only be mm or ff.

For the sake of this example let's say the probability of each option is equal, so P(mm) = P(mf) = P(fm) = P(ff) = 0.25 for Fraternal twins and P(mm) = P(ff) = 0.5 for identical twins. The probability that twins are identical is P(I) = 0.1 so P(F) = 0.9 (probability of Fraternal), assuming twins must be either identical or fraternal (not strictly true but let's not make things too complicated).

If we have two brothers who are twins what is the probability that they are identical twins?

The non-Bayesian answer might be 0.1 or 10%. But this is incorrect. Bayes' formula gives the correct answer:

The probability of identical twins given that both twins are brothers written as P(I|B) = P(B|I)P(I)/P(B)

and since we are assuming twins must be either identical or fraternal then: P(B) = P(B|I)P(I) + P(B|F)P(F)

substituting this into the above gives: P(I|B) = P(B|I)P(I) / (P(B|I)P(I) + P(B|F)P(F))

then putting in the numbers gives (0.5 x 0.1)/((0.5 x 0.1) + (0.25 x 0.9)) = 2/11 (about 18.2%) - so the knowledge that both twins are male makes the probability they are identical higher.

weight | height | age |

58 | 1.56 | 23 |

87 | 1.82 | 56 |

94 | 2.01 | 33 |

We could plot this data on a three dimensional grid, we can also represent each row with a list of three values, for example the first row becomes: [58,1.56,23]. This list is a vector. Vectors can be used to represent all kinds of data including natural language, images and so on.

For example, for a given dataframe (df) which has a gender column containing either 'male' or 'female' it is possible to create a new dataframe (df_male) that contains only males:

df_male = df[df['Gender']=='male']

In the examples below I wanted to create dataframes were there was more than one possible value. In this case I can create a list and use the .isin() function to select the rows.

import pandas as pd

# Answer labels from the survey's 'YearsProgram' column, grouped into four
# experience bands.
lessThanFive = [
    'Less than a year', '1 to 2 years', '2 to 3 years',
    '3 to 4 years', '4 to 5 years', '5 to 6 years',
]
sixToTen = [
    '6 to 7 years', '7 to 8 years', '8 to 9 years',
    '9 to 10 years', '10 to 11 years',
]
elevenTofifteen = [
    '11 to 12 years', '12 to 13 years',
    '13 to 14 years', '14 to 15 years',
]
moreThanFifteen = [
    '15 to 16 years', '16 to 17 years', '17 to 18 years',
    '18 to 19 years', '19 to 20 years', '20 or more years',
]

# Band names in display order, plus a lookup from band name to its labels.
keys_2017 = ['lessThanFive', 'sixToTen', 'elevenTofifteen', 'moreThanFifteen']
keys_dict = dict(
    zip(keys_2017, (lessThanFive, sixToTen, elevenTofifteen, moreThanFifteen))
)

# Shared accumulator: band name -> mean salary; the same four keys are
# overwritten on every input_vales call.
dict_2017_tabs = {}

def get_mean_salary(name, key):
    """Return the mean 'Salary' of rows whose 'YearsProgram' is in one band.

    Parameters:
        name: which pre-filtered dataframe to use: 'df_tabs', 'df_spaces'
            or 'df_masters'; any other value falls back to df_degree
            (matching the original if/elif chain's else branch).
        key: a band name from keys_2017, looked up in the module-level
            keys_dict to get the list of 'YearsProgram' labels in that band.

    Returns:
        Mean salary over the matching rows as a float (NaN if none match).
    """
    key_list = keys_dict[key]
    # Dispatch table replaces the if/elif chain; .get() preserves the
    # fall-through to df_degree for unrecognised names.
    frames = {
        'df_tabs': df_tabs,
        'df_spaces': df_spaces,
        'df_masters': df_masters,
    }
    base = frames.get(name, df_degree)
    selected = base[base['YearsProgram'].isin(key_list)]
    return selected['Salary'].mean()

def input_vales(name):
    """Fill dict_2017_tabs with the mean salary per experience band for the
    named cohort dataframe, then print the result.

    NOTE(review): the function name looks like a typo for input_values; kept
    as-is because the script calls it under this name. Results accumulate in
    the module-level dict_2017_tabs — the same four keys are simply
    overwritten on each call.
    """
    # Iterate the band names directly instead of indexing with range(0, 4).
    for key in keys_2017:
        dict_2017_tabs[key] = get_mean_salary(name, key)
    # Indentation was lost in the source listing; the prints are assumed to
    # run once per call, after the loop — TODO confirm against the original.
    print(dict_2017_tabs)
    print('\n')

# Load the 2017 Stack Overflow survey and build the four cohorts that
# get_mean_salary / input_vales operate on.
df = pd.read_csv('survey_results_public.csv')

df_prof = df[df['Professional'] == 'Professional developer']
df_tabs = df_prof[df_prof['TabsSpaces'] == 'Tabs']
df_spaces = df_prof[df_prof['TabsSpaces'] == 'Spaces']
df_masters = df[df['FormalEducation'] == "Master's degree"]
df_degree = df[df['FormalEducation'] == "Bachelor's degree"]

# Report the mean salary per experience band for each cohort in turn.
for cohort in ('df_tabs', 'df_spaces', 'df_masters', 'df_degree'):
    input_vales(cohort)

]]>

I will use supervised machine learning to predict survivability on the Titanic. The passenger list is available

here: http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls

Looking at my 'Titanic Survivability' post, here, it is clear that different groups had different survivability

rates. This can be used to 'train' an algorithm which can then be used to predict survivability for a test

dataset.

Before getting into the machine learning the data needs to be prepared. This is best done with Pandas.

The following script:

import pandas as pd

df = pd.read_csv('titanic3.csv')

print(df.count())

generates the following output:

pclass 1309

survived 1309

name 1309

sex 1309

age 1046

sibsp 1309

parch 1309

ticket 1309

fare 1308

cabin 295

embarked 1307

boat 486

body 121

home.dest 745

We can see that some fields (such as boat, body and cabin) have a lot of missing values. There are different

ways of dealing with missing values, the easiest is to drop some of these fields. Note Machine learning

algorithms don't cope well with blanks.

The command: df = df.drop(['body','cabin','boat','home.dest','name','ticket'],axis=1) will achieve this.

We can also drop NAs: df = df.dropna()

Running this and getting the count() again gives:

pclass 1043

survived 1043

sex 1043

age 1043

sibsp 1043

parch 1043

fare 1043

embarked 1043

I just dropped the rows with age = NA, some people may have approached the problem by replacing the NAs

by substituting the average age.

The next problem is the 'sex' and 'embarked' fields. They have string values - 'male' & 'female' for sex, and port-of-embarkation codes for embarked. The machine learning algorithm works with numerical data, so these strings are label-encoded as integers (for sex, 1 for male and 0 for female).

processed_df = df.copy()

le = preprocessing.LabelEncoder()

processed_df.sex = le.fit_transform(processed_df.sex)

processed_df.embarked = le.fit_transform(processed_df.embarked)

We now have numerical data:

pclass survived sex age sibsp parch fare embarked

0 1 1 0 29.0000 0 0 211.3375 2

1 1 1 1 0.9167 1 2 151.5500 2

2 1 0 0 2.0000 1 2 151.5500 2

3 1 0 1 30.0000 1 2 151.5500 2

4 1 0 0 25.0000 1 2 151.5500 2

We now generate two numpy.ndarray objects, X and y:

X = processed_df.drop(['survived'], axis=1).values

y = processed_df['survived'].values

To create the training and test data sets we need to split:

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

We can now start testing some different machine learning algorithms:

clf_dt = tree.DecisionTreeClassifier(max_depth=10)

clf_dt.fit (X_train, y_train)

print(clf_dt.score (X_test, y_test))

This decision tree classifier generates predictions on the test dataset with an accuracy of 77%.

To introduce more randomness into the test/training split we introduce a shuffle validator:

shuffle_validator = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2, random_state=0)

and again run the decision tree:

test_classifier(clf_dt)

this gives a prediction accuracy of:

Accuracy: 0.7715 (+/- 0.02) .... so the model was getting correct predictions between about 75% and 79%

The next model is random forest:

clf_rf = ske.RandomForestClassifier(n_estimators=50)

print(test_classifier(clf_rf))

which gives a slightly better result:

Accuracy: 0.7880 (+/- 0.03) but it takes a lot longer to run (at least on my low power laptop)

We can also try a gradient boost:

clf_gb = ske.GradientBoostingClassifier(n_estimators=50)

print(test_classifier(clf_gb))

This gives a better result:

Accuracy: 0.8201 (+/- 0.02) ... so a best result of 84% correct.

The last model is the voting classifier:

eclf = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)])

print(test_classifier(eclf))

This gives an accuracy of: Accuracy: 0.8024 (+/- 0.02)

The Gradient Boost seemed to give the best results for me and the decision tree classifier gave the worst.

import pandas as pd

from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics

import sklearn.ensemble as ske

import random

def test_classifier(clf):
    """Cross-validate clf on the module-level X/y with the shared
    shuffle_validator and print mean accuracy and spread."""
    cv_scores = cross_validation.cross_val_score(clf, X, y, cv=shuffle_validator)
    mean_acc = cv_scores.mean()
    spread = cv_scores.std()
    print("Accuracy: %0.4f (+/- %0.2f)" % (mean_acc, spread))

df = pd.read_csv('titanic3.csv')

df = df.drop(['body','cabin','boat','home.dest','name','ticket'],axis=1)

df = df.dropna()

processed_df = df.copy()

le = preprocessing.LabelEncoder()

processed_df.sex = le.fit_transform(processed_df.sex)

processed_df.embarked = le.fit_transform(processed_df.embarked)

X = processed_df.drop(['survived'], axis=1).values

y = processed_df['survived'].values

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

clf_dt = tree.DecisionTreeClassifier(max_depth=10)

clf_dt.fit (X_train, y_train)

shuffle_validator = cross_validation.ShuffleSplit(len(X), n_iter=20, test_size=0.2, random_state=0)

test_classifier(clf_dt)

clf_rf = ske.RandomForestClassifier(n_estimators=50)

print(test_classifier(clf_rf))

clf_gb = ske.GradientBoostingClassifier(n_estimators=50)

print(test_classifier(clf_gb))

eclf = ske.VotingClassifier([('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)])

print(test_classifier(eclf))]]>