Move part1 parts to separate folder

author: Vasil Zlatanov <vasko@e4-pattern-vm.europe-west4-a.c.electric-orbit-223819.internal> 2018-12-05 16:36:15 +0000
committer: Vasil Zlatanov <vasko@e4-pattern-vm.europe-west4-a.c.electric-orbit-223819.internal> 2018-12-05 16:36:15 +0000
commit: e42170b70bb9710d73ff22fcd06ae8724a78cbd1 (patch)
tree: 3edc5777e62537b1c79140d89b648b3829564b68 /part1/train.py
parent: bcd380b631184e9d4e58c0aa80afb17727581066 (diff)
download: vz215_np1915-e42170b70bb9710d73ff22fcd06ae8724a78cbd1.tar.gz
vz215_np1915-e42170b70bb9710d73ff22fcd06ae8724a78cbd1.tar.bz2
vz215_np1915-e42170b70bb9710d73ff22fcd06ae8724a78cbd1.zip
1 files changed, 286 insertions, 0 deletions
diff --git a/part1/train.py b/part1/train.py
new file mode 100755
index 0000000..c5d4389
--- /dev/null
+++ b/part1/train.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+# Author: Vasil Zlatanov, Nunzio Pucci
+# EE4 Pattern Recognition coursework
+#
+# usage: train.py [-h] -i DATA [-m EIGEN] [-M REIGEN] [-e ENSEMBLE] [-b]
+#                 [-R RANDOM] [-n NEIGHBORS] [-f FACES] [-c] [-s SEED]
+#                 [-t SPLIT] [-2] [-p] [-l] [-r RECONSTRUCT] [-cm] [-q] [-pr]
+#                 [-alt]
+
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import sys
+import random
+import os
+import psutil
+from random import randint
+
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.decomposition import PCA
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import accuracy_score
+
+import argparse
+import numpy as np
+
+from numpy import genfromtxt
+from numpy import linalg as LA
+
+from timeit import default_timer as timer
+
+n_faces = 52
+n_cases = 10
+n_pixels = 2576
+
+# Subtract the average face from each row of the face matrix, then divide by the given deviations
+def normalise_faces(deviations_tr, average_face, faces):
+    faces = np.subtract(faces, np.tile(average_face, (faces.shape[0],1)))
+    return np.divide(faces, deviations_tr)
+
+# Split data into training and testing sets
+def test_split(n_faces, raw_faces, split, seed):
+    random.seed(seed)
+
+    raw_faces_split = np.split(raw_faces,n_cases)
+    n_training_faces = int(round(n_cases*(1 - split)))
+    n_test_faces = n_cases - n_training_faces
+    faces_train = np.zeros((n_faces, n_training_faces, n_pixels))
+    faces_test = np.zeros((n_faces, n_test_faces, n_pixels))
+    target_train = np.repeat(np.arange(n_faces), n_training_faces)
+    target_test = np.repeat(np.arange(n_faces), n_test_faces)
+
+    for x in range(n_faces):
+        samples = random.sample(range(n_cases), n_training_faces)
+        faces_train[x] = [raw_faces[i+n_cases*x] for i in samples]
+        faces_test[x] = [raw_faces[i+n_cases*x] for i in range(n_cases) if i not in samples]
+
+    faces_train = faces_train.reshape(n_faces*n_training_faces, n_pixels)
+    faces_test = faces_test.reshape(n_faces*n_test_faces, n_pixels)
+    return faces_train, faces_test, target_train, target_test
+
+def draw_results(args, target_test, target_pred):
+    acc_sc = accuracy_score(target_test, target_pred)
+    cm = confusion_matrix(target_test, target_pred)
+    print('Accuracy: ', acc_sc)
+    if (args.conf_mat):
+        plt.matshow(cm, cmap='Blues')
+        plt.colorbar()
+        plt.ylabel('Actual')
+        plt.xlabel('Predicted')
+        plt.show()
+    return
+
+def test_model(M, faces_train, faces_test, target_train, target_test, args):
+    raw_faces_train = faces_train
+    raw_faces_test = faces_test
+
+    explained_variances = ()
+
+
+    distances = np.zeros(faces_test.shape[0])
+
+    if args.pca or args.pca_r:
+        # Compute the principal components, i.e. the M most variant eigenvectors (kept in e_vecs)
+        average_face = np.mean(faces_train, axis=0)
+        if args.classifyalt:
+            deviations_tr = np.ones(n_pixels)
+        else:
+            deviations_tr =  np.std(faces_train, axis=0)
+        faces_train = normalise_faces(deviations_tr, average_face, faces_train)
+        faces_test = normalise_faces(deviations_tr, average_face, faces_test)
+        if (args.pca_r):
+            e_vals, e_vecs = LA.eigh(np.dot(faces_train, faces_train.T))
+            e_vecs = np.dot(faces_train.T, e_vecs)
+            e_vecs = e_vecs/LA.norm(e_vecs, axis = 0)
+        else:
+            e_vals, e_vecs = LA.eigh(np.cov(faces_train.T))
+
+        e_vals = np.flip(e_vals)
+        e_vecs = np.fliplr(e_vecs).T
+
+        if args.random:
+            random_features = random.sample(range(M-args.random, M), args.random)
+            for i in range(args.random):
+                e_vals[M-i] = e_vals[random_features[i]]
+                e_vecs[M-i] = e_vecs[random_features[i]]
+
+        e_vals = e_vals[:M]
+        e_vecs = e_vecs[:M]
+
+        deviations_tr = np.flip(deviations_tr)
+        faces_train = np.dot(faces_train, e_vecs.T)
+        faces_test = np.dot(faces_test, e_vecs.T)
+
+        rec_vecs = np.add(np.tile(average_face,
+            (faces_test.shape[0], 1)), np.dot(faces_test, e_vecs) * deviations_tr)
+        distances = LA.norm(raw_faces_test - rec_vecs, axis=1);
+
+        if args.reconstruct:
+            rec_vec = np.add(average_face, np.dot(faces_train[args.reconstruct], e_vecs) * deviations_tr)
+            ar = plt.subplot(2, 1, 1)
+            ar.imshow(rec_vec.reshape([46,56]).T, cmap = 'gist_gray')
+            ar = plt.subplot(2, 1, 2)
+            ar.imshow(raw_faces_train[args.reconstruct].reshape([46,56]).T, cmap = 'gist_gray')
+            plt.show()
+
+    if args.lda:
+        if args.pca_r or (args.pca and M > n_training_faces - n_faces):
+            lda = LinearDiscriminantAnalysis(n_components=M, solver='svd')
+        else:
+            lda = LinearDiscriminantAnalysis(n_components=M, store_covariance='True')
+
+        faces_train = lda.fit_transform(faces_train, target_train)
+        faces_test = lda.transform(faces_test)
+        class_means = lda.means_
+        e_vals = lda.explained_variance_ratio_
+
+    if args.faces:
+        if args.lda:
+            for i in range(10):
+                ax = plt.subplot(2, 5, i + 1)
+                ax.imshow(class_means[i].reshape([46,56]).T)
+        else:
+            for i in range(args.faces):
+                ax = plt.subplot(2, args.faces/2, i + 1)
+                ax.imshow(e_vecs[i].reshape([46, 56]).T, cmap = 'gist_gray')
+        plt.show()
+
+    if args.principal:
+        e_vals = np.multiply(np.divide(e_vals, np.sum(e_vals)), 100)
+        plt.bar(np.arange(M), e_vals[:M])
+        plt.ylabel('Varaiance ratio (%)');plt.xlabel('Number')
+        plt.show()
+
+    if args.grapheigen:
+        # Colors for distinct individuals
+        cols = ['#{:06x}'.format(randint(0, 0xffffff)) for i in range(n_faces)]
+        pltCol = [cols[int(k)] for k in target_train]
+        fig = plt.figure()
+        ax = fig.add_subplot(111, projection='3d')
+        ax.scatter(faces_train[:, 0], faces_train[:, 1], faces_train[:, 2], marker='o', color=pltCol)
+        plt.show()
+
+    classifier = KNeighborsClassifier(n_neighbors=args.neighbors)
+    classifier.fit(faces_train, target_train)
+    target_pred = classifier.predict(faces_test)
+    if args.prob:
+        targer_prob = classifier.predict_proba(faces_test)
+        targer_prob_vec = np.zeros(104)
+        for i in range (104):
+            j = int(np.floor(i/2))
+            targer_prob_vec [i] = targer_prob[i][j]
+        avg_targer_prob = np.zeros(n_faces)
+        for i in range (n_faces):
+            avg_targer_prob[i] = (targer_prob_vec[2*i] + targer_prob_vec[2*i + 1])/2
+        plt.bar(range(n_faces), avg_targer_prob)
+        plt.show()
+
+    return target_pred, distances
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--data", help="Input CSV file", required=True)
+    parser.add_argument("-m", "--eigen", help="Number of eigenvalues in model", type=int, default = 10 )
+    parser.add_argument("-M", "--reigen", help="Number of eigenvalues in model", type=int)
+    parser.add_argument("-e", "--ensemble", help="Number of ensemmbles to use", type=int, default = 0)
+    parser.add_argument("-b", "--bagging", help="Number of bags to use", action='store_true')
+    parser.add_argument("-R", "--random", help="Number of eigen value to randomise", type=int)
+    parser.add_argument("-n", "--neighbors", help="How many neighbors to use", type=int, default = 1)
+    parser.add_argument("-f", "--faces", help="Show faces", type=int, default = 0)
+    parser.add_argument("-c", "--principal", help="Show principal components", action='store_true')
+    parser.add_argument("-s", "--seed", help="Seed to use", type=int, default=0)
+    parser.add_argument("-t", "--split", help="Fractoin of data to use for testing", type=float, default=0.3)
+    parser.add_argument("-2", "--grapheigen", help="Swow 2D graph of targets versus principal components",
+            action='store_true')
+    parser.add_argument("-p", "--pca", help="Use PCA", action='store_true')
+    parser.add_argument("-l", "--lda", help="Use LDA", action='store_true')
+    parser.add_argument("-r", "--reconstruct", help="Use PCA reconstruction, specify face NR", type=int, default=0)
+    parser.add_argument("-cm", "--conf_mat", help="Show visual confusion matrix", action='store_true')
+    parser.add_argument("-q", "--pca_r", help="Use Reduced PCA", action='store_true')
+    parser.add_argument("-pr", "--prob", help="Certainty on each guess", action='store_true')
+    parser.add_argument("-alt", "--classifyalt", help="Alternative method ON", action='store_true')
+    args = parser.parse_args()
+
+    if args.lda and args.classifyalt:
+        sys.exit("LDA and Alt PCA can not be performed together")
+
+    raw_faces = genfromtxt(args.data, delimiter=',')
+    targets = np.repeat(np.arange(n_faces),n_cases)
+
+    faces_train, faces_test, target_train, target_test = test_split(n_faces, raw_faces, args.split, args.seed)
+
+    if args.ensemble:
+        n_training_faces = int(round(n_cases*(1 - args.split)))
+        faces_train_ens = np.zeros((args.ensemble, n_faces, n_training_faces, n_pixels))
+        for x in range(args.ensemble):
+            if args.bagging:
+                for k in range(n_faces):
+                    samples = random.choices(range(n_training_faces), k=n_training_faces)
+                    faces_train_ens[x][k] = [faces_train[i+n_training_faces*k] for i in samples]
+            else:
+                faces_train_ens[x] = faces_train.reshape((n_faces, n_training_faces, n_pixels))
+
+        faces_train_ens = faces_train_ens.reshape(args.ensemble, n_faces*n_training_faces, n_pixels)
+
+    if args.classifyalt:
+        faces_train = faces_train.reshape(n_faces, int(faces_train.shape[0]/n_faces), n_pixels)
+        target_train = target_train.reshape(n_faces, int(target_train.shape[0]/n_faces))
+
+        distances = np.zeros((n_faces, faces_test.shape[0]))
+        for i in range(n_faces):
+            target_pred, distances[i] = test_model(args.eigen, faces_train[i],
+                    faces_test, target_train[i], target_test, args)
+        target_pred = np.argmin(distances, axis=0)
+    elif args.reigen:
+        target_pred = np.zeros((args.reigen-args.eigen, target_test.shape[0]))
+        accuracy    = np.zeros(args.reigen-args.eigen)
+        rec_error   = np.zeros((args.reigen-args.eigen, target_test.shape[0]))
+
+        for M in range(args.eigen, args.reigen):
+            start = timer()
+            target_pred[M - args.eigen], rec_error[M - args.eigen] = test_model(M, faces_train,
+                    faces_test, target_train, target_test, args)
+            end = timer()
+            print("Run with", M, "eigenvalues completed in ", end-start, "seconds")
+            print("Memory Used:", psutil.Process(os.getpid()).memory_info().rss)
+            accuracy[M - args.eigen] = accuracy_score(target_test, target_pred[M-args.eigen])
+        # Plot
+        print('Max efficiency of ', max(accuracy), '% for M =', np.argmax(accuracy))
+        plt.plot(range(args.eigen, args.reigen), 100*accuracy)
+        plt.xlabel('Number of Eigenvectors used (M)')
+        plt.ylabel('Recognition Accuracy (%)')
+        plt.grid(True)
+        plt.show()
+    elif args.ensemble:
+        rec_error = np.zeros((args.ensemble, n_faces, faces_test.shape[0]))
+        target_pred = np.zeros((args.ensemble, target_test.shape[0]))
+        for i in range(args.ensemble):
+            target_pred[i], rec_error[i] = test_model(args.eigen, faces_train_ens[i],
+                    faces_test, target_train, target_test, args)
+
+        target_pred_comb = np.zeros(target_pred.shape[1])
+        target_pred = target_pred.astype(int).T
+        if (args.conf_mat):
+            cm = confusion_matrix(np.tile(target_test, args.ensemble), target_pred.flatten('F'))
+            plt.matshow(cm, cmap='Blues')
+            plt.colorbar()
+            plt.ylabel('Actual')
+            plt.xlabel('Predicted')
+            plt.show()
+
+        for i in range(target_pred.shape[0]):
+            target_pred_comb[i] = np.bincount(target_pred[i]).argmax()
+        target_pred = target_pred_comb
+    else:
+        M = args.eigen
+        start = timer()
+        target_pred, distances = test_model(M, faces_train, faces_test, target_train, target_test, args)
+        end = timer()
+
+    draw_results(args, target_test, target_pred)
+
+if __name__ == "__main__":
+    main()
author	Vasil Zlatanov <vasko@e4-pattern-vm.europe-west4-a.c.electric-orbit-223819.internal>	2018-12-05 16:36:15 +0000
committer	Vasil Zlatanov <vasko@e4-pattern-vm.europe-west4-a.c.electric-orbit-223819.internal>	2018-12-05 16:36:15 +0000
commit	e42170b70bb9710d73ff22fcd06ae8724a78cbd1 (patch)
tree	3edc5777e62537b1c79140d89b648b3829564b68 /part1/train.py
parent	bcd380b631184e9d4e58c0aa80afb17727581066 (diff)
download	vz215_np1915-e42170b70bb9710d73ff22fcd06ae8724a78cbd1.tar.gz vz215_np1915-e42170b70bb9710d73ff22fcd06ae8724a78cbd1.tar.bz2 vz215_np1915-e42170b70bb9710d73ff22fcd06ae8724a78cbd1.zip