1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
#!/usr/bin/python
# EE4 Selected Topics From Computer Vision Coursework
# Vasil Zlatanov, Nunzio Pucci
# --- Configuration ---------------------------------------------------------
# Path of the .npz archive holding the descriptor arrays.
DATA_FILE = 'data.npz'
# Number of visual words: KMeans cluster count, or max leaves per tree.
CLUSTER_CNT = 256
# True  -> build the codebook with KMeans.
# False -> build the codebook with a RandomTreesEmbedding.
KMEANS = False
# Passed as `n_init` to KMeans, or as `n_estimators` to the tree embedding.
N_ESTIMATORS = 1000 if KMEANS else 1
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomTreesEmbedding
# Pull the descriptor sets out of the archive.
data = np.load(DATA_FILE)
train, test = data['desc_tr'], data['desc_te']
# The codebook is fitted on the transposed 'desc_sel' array
# (presumably one descriptor per row after the transpose -- TODO confirm).
train_part = data['desc_sel'].T

# Fit the codebook: either a random-trees embedding or KMeans.
if not KMEANS:
    trees = RandomTreesEmbedding(
        max_leaf_nodes=CLUSTER_CNT,
        n_estimators=N_ESTIMATORS,
        random_state=0,
    )
    trees = trees.fit(train_part)
else:
    print("Computing KMeans with", train_part.shape[0], "keywords")
    kmeans = KMeans(
        n_clusters=CLUSTER_CNT,
        n_init=N_ESTIMATORS,
        random_state=0,
    )
    kmeans = kmeans.fit(train_part)

print("Generating histograms")
def make_histogram(data):
    """Build a visual-word histogram for every entry ``data[i][j]``.

    Each ``data[i][j]`` is transposed and quantised with the module-level
    codebook (``kmeans`` when ``KMEANS`` is true, otherwise ``trees``); the
    resulting word indices are counted into a fixed-length histogram.

    Args:
        data: 2-D indexable array; ``data[i][j]`` holds the descriptors of
            one image (assumed to be laid out as (descriptor_dim,
            n_descriptors), since it is transposed before quantisation --
            TODO confirm against the data file).

    Returns:
        Float array of shape ``(data.shape[0], data.shape[1], n_bins)`` where
        ``n_bins`` is ``CLUSTER_CNT`` in KMeans mode and
        ``CLUSTER_CNT * <number of trees>`` in tree-embedding mode.
    """
    if KMEANS:
        # One bin per cluster.  N_ESTIMATORS is only KMeans' n_init here and
        # must not inflate the histogram width.
        n_bins = CLUSTER_CNT
    else:
        # trees.apply() reports *node ids*, which can be as large as roughly
        # twice the leaf count, so they cannot be used as bin indices
        # directly.  Build, per tree, a lookup from node id to the rank of
        # that leaf among the tree's leaves (0 .. n_leaves-1).  This relies
        # on the embedding having been fitted with
        # max_leaf_nodes=CLUSTER_CNT so that every rank is < CLUSTER_CNT.
        leaf_ranks = []
        for estimator in trees.estimators_:
            is_leaf = estimator.tree_.children_left == -1
            leaf_ranks.append(np.cumsum(is_leaf) - 1)
        n_bins = CLUSTER_CNT * len(leaf_ranks)

    histogram = np.zeros((data.shape[0], data.shape[1], n_bins))
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            descriptors = data[i][j].T
            if KMEANS:
                histogram[i][j] = np.bincount(
                    kmeans.predict(descriptors), minlength=CLUSTER_CNT)
            else:
                # leaves[:, t] is the node id reached in tree t.
                leaves = trees.apply(descriptors)
                counts = np.zeros((CLUSTER_CNT, len(leaf_ranks)))
                for t, ranks in enumerate(leaf_ranks):
                    counts[:, t] = np.bincount(
                        ranks[leaves[:, t]], minlength=CLUSTER_CNT)
                # Same (bin, tree) -> flat layout as the original reshape.
                histogram[i][j] = counts.reshape(-1)
    return histogram
# Quantise every training / test image into a visual-word histogram.
hist_train = make_histogram(train)
hist_test = make_histogram(test)

print("Keywords shape", hist_train.shape, "\n")
print("Planting trees...")
clf = RandomForestClassifier()
# Flatten (class, image, bins) -> (class*image, bins); the label of each row
# is its index along the first axis, repeated once per image.
clf.fit(
    hist_train.reshape((hist_train.shape[0]*hist_train.shape[1], hist_train.shape[2])),
    np.repeat(np.arange(hist_train.shape[0]), hist_train.shape[1]))
print("Random forests created")

test_pred = clf.predict(hist_test.reshape((hist_test.shape[0]*hist_test.shape[1], hist_test.shape[2])))
test_label = np.repeat(np.arange(hist_test.shape[0]), hist_test.shape[1])
# plot_confusion_matrix expects (y_true, y_pred): ground truth goes first so
# rows are true classes and normalisation is per true class.
skplt.metrics.plot_confusion_matrix(test_label, test_pred, normalize=True)
plt.show()
|