# evaluate.py

#!/usr/bin/python
# EE4 Selected Topics From Computer Vision Coursework
# Vasil Zlatanov, Nunzio Pucci

# Run configuration.
DATA_FILE = 'data.npz'   # archive holding the descriptor arrays
CLUSTER_CNT = 256        # KMeans cluster count / max leaves per embedding tree
KMEANS = False           # True: KMeans codebook, False: random-trees embedding

# Passed as KMeans' n_init when KMEANS is set, otherwise as the number of
# trees in the RandomTreesEmbedding.
N_ESTIMATORS = 1000 if KMEANS else 1

import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomTreesEmbedding

# np.load on an .npz archive returns a lazily-read NpzFile that holds the
# file open; the context manager closes it once the arrays are in memory.
with np.load(DATA_FILE) as data:
    train = data['desc_tr']
    test = data['desc_te']
    # Transposed so that rows are the samples handed to the quantiser's fit().
    train_part = data['desc_sel'].T

# Fit the quantiser selected by KMEANS on the descriptor subset. Only one of
# `kmeans` / `trees` is defined afterwards; make_histogram reads the same flag.
if KMEANS:
    print("Computing KMeans with", train_part.shape[0], "keywords")
    kmeans = KMeans(n_clusters=CLUSTER_CNT, n_init=N_ESTIMATORS, random_state=0).fit(train_part)
else:
    trees = RandomTreesEmbedding(max_leaf_nodes=CLUSTER_CNT, n_estimators=N_ESTIMATORS, random_state=0).fit(train_part)


print("Generating histograms")

def make_histogram(data):
    """Build one codeword-count histogram for every ``data[i][j]`` entry.

    Each ``data[i][j]`` is transposed and passed to the fitted quantiser
    (module-level ``kmeans`` when ``KMEANS`` is set, otherwise ``trees``),
    and the assignments it returns are counted with ``np.bincount``.

    Args:
        data: Array indexable as ``data[i][j]`` with at least two leading
            axes; every entry must support ``.T``.

    Returns:
        Float array of shape ``(data.shape[0], data.shape[1], width)``.
        ``width`` is ``CLUSTER_CNT`` for KMeans and
        ``CLUSTER_CNT * N_ESTIMATORS`` for the tree embedding (one block of
        counts per tree, flattened in C order).
    """
    # With KMeans, N_ESTIMATORS is only the n_init restart count and must not
    # widen the histogram: bincount yields CLUSTER_CNT bins, which cannot be
    # broadcast into a CLUSTER_CNT * N_ESTIMATORS row.
    hist_width = CLUSTER_CNT if KMEANS else CLUSTER_CNT * N_ESTIMATORS
    histogram = np.zeros((data.shape[0], data.shape[1], hist_width))
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            descriptors = data[i][j].T
            if KMEANS:
                histogram[i, j] = np.bincount(kmeans.predict(descriptors), minlength=CLUSTER_CNT)
            else:
                # apply() gives one leaf index per (sample, tree); count the
                # indices independently for each tree column.
                # NOTE(review): assumes every returned leaf index is
                # < CLUSTER_CNT; a larger index would make bincount return
                # longer columns and break the reshape below - confirm.
                leaves = trees.apply(descriptors)
                counts = np.apply_along_axis(np.bincount, axis=0, arr=leaves, minlength=CLUSTER_CNT)
                histogram[i, j] = counts.reshape(hist_width)
    return histogram

hist_train = make_histogram(train)
hist_test = make_histogram(test)

print("Keywords shape", hist_train.shape, "\n")
print("Planting trees...")

# Flatten the two leading axes into one row per histogram. The label of each
# row is its index along the first axis, repeated once per entry of the
# second axis, which matches the row order produced by reshape.
clf = RandomForestClassifier()
clf.fit(
        hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])),
        np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1]))

print("Random forests created")

test_pred = clf.predict(hist_test.reshape((hist_test.shape[0]*hist_test.shape[1], hist_test.shape[2])))
test_label = np.repeat(np.arange(hist_test.shape[0]), hist_test.shape[1])

# plot_confusion_matrix expects (y_true, y_pred). Passing them the other way
# round transposes the matrix and normalises over predictions instead of
# over the true classes.
skplt.metrics.plot_confusion_matrix(test_label, test_pred, normalize=True)
plt.show()