path: root/evaluate.py
#!/usr/bin/python
# EE4 Selected Topics From Computer Vision Coursework
# Vasil Zlatanov, Nunzio Pucci
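#
# Bag-of-visual-words classification: a visual codebook is built from a pool
# of descriptors (either K-means cluster centers or the leaves of a
# random-forest embedding), every image is encoded as a histogram of codeword
# counts, and the histograms are classified with a random forest.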

import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
import argparse
from timeit import default_timer as timer
import logging
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomTreesEmbedding
from sklearn.metrics import accuracy_score

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--data", help="Data path", action='store_true', default='data.npz')
parser.add_argument("-c", "--conf_mat", help="Show visual confusion matrix", action='store_true')
parser.add_argument("-k", "--kmean", help="Perform kmean clustering with --kmean cluster centers", type=int, default=0)
parser.add_argument("-l", "--leaves", help="Maximum leaf nodes for RF classifier", type=int, default=256)
parser.add_argument("-e", "--estimators", help="number of estimators to be used", type=int, default=100)
parser.add_argument("-v", "--verbose", help="Use verbose output", action='store_true')

args = parser.parse_args()
if args.verbose:
    logging.basicConfig(level=logging.DEBUG)

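# Load the descriptor arrays. 'desc_tr' and 'desc_te' are assumed to be
# indexed as [class][image] -> descriptor matrix; 'desc_sel' is the pool of
# descriptors used to build the codebook, stored feature-major (hence the
# transpose below).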
data = np.load(args.data)
train = data['desc_tr']
test = data['desc_te']
train_part = data['desc_sel'].T
logging.debug("Verbose is on")

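# Build the visual codebook: K-means cluster centers when --kmean is given,
# otherwise an unsupervised random-forest embedding whose leaves act as the
# codewords.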
if args.kmean:
    logging.debug("Computing KMeans with %d keywords", train_part.shape[0])
    kmeans = KMeans(n_clusters=args.kmean, n_init=args.estimators, random_state=0).fit(train_part)
else:
    trees = RandomTreesEmbedding(max_leaf_nodes=args.leaves, n_estimators=args.estimators, random_state=0).fit(train_part)


logging.debug("Generating histograms")

def make_histogram(data):
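    """Encode each image as a bag-of-visual-words histogram.

    Each entry of `data` is assumed to hold the descriptors of one image
    (one column per descriptor); descriptors are assigned to codewords
    (K-means clusters or forest leaves) and the assignments are counted.
    """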
    # One histogram bin per codeword: K bins for K-means, one bin per leaf
    # per tree for the random-forest embedding.
    if args.kmean:
        hist_size = args.kmean
    else:
        hist_size = args.estimators*args.leaves

    histogram = np.zeros((data.shape[0], data.shape[1], hist_size))
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            if args.kmean:
                # Count how many descriptors fall into each cluster.
                histogram[i][j] = np.bincount(kmeans.predict(data[i][j].T), minlength=args.kmean)
            else:
                # Count, per tree, how many descriptors land in each leaf and
                # concatenate the per-tree counts into one long histogram.
                leaves = trees.apply(data[i][j].T)
                leaves = np.apply_along_axis(np.bincount, axis=0, arr=leaves, minlength=args.leaves)
                histogram[i][j] = leaves.reshape(hist_size)
    return histogram

hist_train = make_histogram(train)
hist_test = make_histogram(test)

logging.debug("Keywords shape", hist_train.shape, "\n")
logging.debug("Planting trees...")
clf = RandomForestClassifier()
clf.fit(
        hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])),
        np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1]))

logging.debug("Random forests created")

# Predict class labels for the test histograms and report overall accuracy.
test_pred = clf.predict(hist_test.reshape((hist_test.shape[0]*hist_test.shape[1], hist_test.shape[2])))
test_label = np.repeat(np.arange(hist_test.shape[0]), hist_test.shape[1])

print(accuracy_score(test_label, test_pred))

if args.conf_mat:
    skplt.metrics.plot_confusion_matrix(test_label, test_pred, normalize=True)
    plt.show()