author | Vasil Zlatanov <v@skozl.com> | 2019-06-24 21:31:41 +0100
committer | Vasil Zlatanov <v@skozl.com> | 2019-06-24 21:31:41 +0100
commit | 02f4425ceadec0c10c1e9903286c1121a9313357 (patch)
tree | cc8b2447ec0a5fa8d38926fd516823c62341c3d8 /util
Diffstat (limited to 'util')
-rw-r--r-- | util/__pycache__/dataset_utils.cpython-37.pyc | bin 0 -> 4355 bytes
-rw-r--r-- | util/dataset_utils.py | 150
-rwxr-xr-x | util/generate_smaller.sh | 22
-rwxr-xr-x | util/make-tfrecords.py | 203
-rwxr-xr-x | util/plot-report | 102
-rwxr-xr-x | util/plot-softmax | 94
-rwxr-xr-x | util/splitter | 30
-rwxr-xr-x | util/splitter-man | 30
-rw-r--r-- | util/test.dump | 0
9 files changed, 631 insertions, 0 deletions
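
Below, make-tfrecords.py writes one tf.train.Example per image, using the feature keys defined in dataset_utils.py (image/encoded, image/format, image/class/label, image/height, image/width). For orientation, here is a minimal sketch — not part of the commit — of reading such a records file back with the same TF 1.x API family the scripts use; the filename is assumed to follow the default pattern produced by _get_dataset_filename():

    import tensorflow as tf

    # Feature keys must match image_to_tfexample() in util/dataset_utils.py.
    path = 'websites_train_00000-of-00001.tfrecord'  # assumed default-named output
    for record in tf.python_io.tf_record_iterator(path):
        example = tf.train.Example.FromString(record)
        feat = example.features.feature
        label = feat['image/class/label'].int64_list.value[0]
        height = feat['image/height'].int64_list.value[0]
        width = feat['image/width'].int64_list.value[0]
        png_bytes = feat['image/encoded'].bytes_list.value[0]
        print(label, height, width, len(png_bytes))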
diff --git a/util/__pycache__/dataset_utils.cpython-37.pyc b/util/__pycache__/dataset_utils.cpython-37.pyc
new file mode 100644
index 0000000..7ed2852
--- /dev/null
+++ b/util/__pycache__/dataset_utils.cpython-37.pyc
Binary files differ
diff --git a/util/dataset_utils.py b/util/dataset_utils.py
new file mode 100644
index 0000000..fdaefca
--- /dev/null
+++ b/util/dataset_utils.py
@@ -0,0 +1,150 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains utilities for downloading and converting datasets."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import tarfile
+
+from six.moves import urllib
+import tensorflow as tf
+
+LABELS_FILENAME = 'labels.txt'
+
+
+def int64_feature(values):
+  """Returns a TF-Feature of int64s.
+
+  Args:
+    values: A scalar or list of values.
+
+  Returns:
+    A TF-Feature.
+  """
+  if not isinstance(values, (tuple, list)):
+    values = [values]
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+
+def bytes_feature(values):
+  """Returns a TF-Feature of bytes.
+
+  Args:
+    values: A string.
+
+  Returns:
+    A TF-Feature.
+  """
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+
+
+def float_feature(values):
+  """Returns a TF-Feature of floats.
+
+  Args:
+    values: A scalar or list of values.
+
+  Returns:
+    A TF-Feature.
+  """
+  if not isinstance(values, (tuple, list)):
+    values = [values]
+  return tf.train.Feature(float_list=tf.train.FloatList(value=values))
+
+
+def image_to_tfexample(image_data, image_format, height, width, class_id):
+  return tf.train.Example(features=tf.train.Features(feature={
+      'image/encoded': bytes_feature(image_data),
+      'image/format': bytes_feature(image_format),
+      'image/class/label': int64_feature(class_id),
+      'image/height': int64_feature(height),
+      'image/width': int64_feature(width),
+  }))
+
+
+def download_and_uncompress_tarball(tarball_url, dataset_dir):
+  """Downloads the `tarball_url` and uncompresses it locally.
+
+  Args:
+    tarball_url: The URL of a tarball file.
+    dataset_dir: The directory where the temporary files are stored.
+  """
+  filename = tarball_url.split('/')[-1]
+  filepath = os.path.join(dataset_dir, filename)
+
+  def _progress(count, block_size, total_size):
+    sys.stdout.write('\r>> Downloading %s %.1f%%' % (
+        filename, float(count * block_size) / float(total_size) * 100.0))
+    sys.stdout.flush()
+  filepath, _ = urllib.request.urlretrieve(tarball_url, filepath, _progress)
+  print()
+  statinfo = os.stat(filepath)
+  print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
+  tarfile.open(filepath, 'r:gz').extractall(dataset_dir)
+
+
+def write_label_file(labels_to_class_names, dataset_dir,
+                     filename=LABELS_FILENAME):
+  """Writes a file with the list of class names.
+
+  Args:
+    labels_to_class_names: A map of (integer) labels to class names.
+    dataset_dir: The directory in which the labels file should be written.
+    filename: The filename where the class names are written.
+  """
+  labels_filename = os.path.join(dataset_dir, filename)
+  with tf.gfile.Open(labels_filename, 'w') as f:
+    for label in labels_to_class_names:
+      class_name = labels_to_class_names[label]
+      f.write('%d:%s\n' % (label, class_name))
+
+
+def has_labels(dataset_dir, filename=LABELS_FILENAME):
+  """Specifies whether or not the dataset directory contains a label map file.
+
+  Args:
+    dataset_dir: The directory in which the labels file is found.
+    filename: The filename where the class names are written.
+
+  Returns:
+    `True` if the labels file exists and `False` otherwise.
+  """
+  return tf.gfile.Exists(os.path.join(dataset_dir, filename))
+
+
+def read_label_file(dataset_dir, filename=LABELS_FILENAME):
+  """Reads the labels file and returns a mapping from ID to class name.
+
+  Args:
+    dataset_dir: The directory in which the labels file is found.
+    filename: The filename where the class names are written.
+
+  Returns:
+    A map from a label (integer) to class name.
+  """
+  labels_filename = os.path.join(dataset_dir, filename)
+  with tf.gfile.Open(labels_filename, 'rb') as f:
+    lines = f.read().decode()
+  lines = lines.split('\n')
+  lines = filter(None, lines)
+
+  labels_to_class_names = {}
+  for line in lines:
+    index = line.index(':')
+    labels_to_class_names[int(line[:index])] = line[index+1:]
+  return labels_to_class_names
diff --git a/util/generate_smaller.sh b/util/generate_smaller.sh
new file mode 100755
index 0000000..68769b4
--- /dev/null
+++ b/util/generate_smaller.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Composites each logo, at widths 16-96 px, onto a 320x240 random-noise canvas
+# at a random offset, with randomised brightness, saturation and hue.
+
+OUTPUT_DIR='better_noise_logos'
+
+for width in $(seq 16 1 96); do
+  echo "Making width of $width"
+  for brand in "$1"/*; do
+    mkdir -p "$OUTPUT_DIR/$brand"
+
+    find "$brand" -iname '*.png' | while read -r image; do
+      BRI=$((80 + RANDOM % 40))
+      SAT=$((80 + RANDOM % 40))
+      HUE=$((60 + RANDOM % 80))
+      width_off=$((RANDOM % (320 - width)))
+      height_off=$((RANDOM % (240 - width)))
+      convert -size 320x240 xc: +noise Random "$image" -modulate "$BRI,$SAT,$HUE" -geometry "${width}x+${width_off}+${height_off}" -composite "$OUTPUT_DIR/$brand/$(basename "$image").$width.png"
+      sleep 1
+    done
+  done
+done
diff --git a/util/make-tfrecords.py b/util/make-tfrecords.py
new file mode 100755
index 0000000..f9f3e71
--- /dev/null
+++ b/util/make-tfrecords.py
@@ -0,0 +1,203 @@
+#!/usr/bin/python
+import random
+import os
+import sys
+import math
+import tensorflow as tf
+import dataset_utils
+import numpy as np
+
+#===============DEFINE YOUR ARGUMENTS==============
+flags = tf.app.flags
+
+# State your dataset directory.
+flags.DEFINE_string('data', None, 'String: Your dataset directory')
+
+# The proportion of examples held out for validation. This is essentially
+# your evaluation dataset.
+flags.DEFINE_float('validation_size', 0.25, 'Float: The proportion of examples in the dataset to be used for validation')
+
+# The number of shards per dataset split.
+flags.DEFINE_integer('num_shards', 1, 'Int: Number of shards to split the TFRecord files')
+
+# Seed for repeatability.
+flags.DEFINE_integer('random_seed', 0, 'Int: Random seed to use for repeatability.')
+flags.DEFINE_bool('overwrite', False, 'Overwrite previously generated files')
+
+FLAGS = flags.FLAGS
+
+class ImageReader(object):
+    """Helper class that provides TensorFlow image coding utilities."""
+
+    def __init__(self):
+        # Initializes the function that decodes PNG data.
+        self._decode_png_data = tf.placeholder(dtype=tf.string)
+        self._decode_png = tf.image.decode_png(self._decode_png_data, channels=0)
+
+    def read_image_dims(self, sess, image_data):
+        image = self.decode_png(sess, image_data)
+        return image.shape[0], image.shape[1]
+
+    def decode_png(self, sess, image_data):
+        image = sess.run(self._decode_png,
+                         feed_dict={self._decode_png_data: image_data})
+        assert len(image.shape) == 3
+        return image
+
+def _get_filenames_and_classes(data):
+    """Returns a list of filenames and inferred class names.
+
+    Args:
+      data: A directory containing a set of subdirectories representing
+        class names. Each subdirectory should contain PNG or JPG encoded images.
+
+    Returns:
+      A list of image file paths, rooted at `data`, and the sorted list of
+      subdirectory names, representing class names.
+    """
+    directories = []
+    class_names = []
+    for filename in os.listdir(data):
+        path = os.path.join(data, filename)
+        if os.path.isdir(path):
+            print(path)
+            directories.append(path)
+            class_names.append(filename)
+
+    photo_filenames = []
+    for directory in directories:
+        for filename in os.listdir(directory):
+            path = os.path.join(directory, filename)
+            photo_filenames.append(path)
+
+    return photo_filenames, sorted(class_names)
+
+
+def _get_dataset_filename(data, split_name, shard_id, _NUM_SHARDS):
+    output_filename = 'websites_%s_%05d-of-%05d.tfrecord' % (
+        split_name, shard_id, _NUM_SHARDS)
+    return os.path.join(data, output_filename)
+
+
+def _convert_dataset(split_name, filenames, class_names_to_ids, data, _NUM_SHARDS):
+    """Converts the given filenames to a TFRecord dataset.
+
+    Args:
+      split_name: The name of the dataset, either 'train' or 'validation'.
+      filenames: A list of absolute paths to png or jpg images.
+      class_names_to_ids: A dictionary from class names (strings) to ids
+        (integers).
+      data: The directory where the converted datasets are stored.
+ """ + assert split_name in ['train', 'validation'] + + failed = 0 + success = 0 + # class_cnts is used for balancing training through class_weights + class_cnts = [0] * len(class_names_to_ids) + num_per_shard = int(math.ceil(len(filenames) / float(_NUM_SHARDS))) + + with tf.Graph().as_default(): + image_reader = ImageReader() + + with tf.Session('') as sess: + + for shard_id in range(_NUM_SHARDS): + output_filename = _get_dataset_filename( + data, split_name, shard_id, _NUM_SHARDS) + + with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer: + start_ndx = shard_id * num_per_shard + end_ndx = min((shard_id+1) * num_per_shard, len(filenames)) + for i in range(start_ndx, end_ndx): +# sys.stdout.write('\r>> Converting image %d/%d shard %d: %s' % ( +# i+1, len(filenames), shard_id, filenames[i])) +# sys.stdout.flush() + + # Read the filename: + image_data = tf.gfile.FastGFile(filenames[i], 'rb').read() + try: + height, width = image_reader.read_image_dims(sess, image_data) + class_name = os.path.basename(os.path.dirname(filenames[i])) + class_id = class_names_to_ids[class_name] + + example = dataset_utils.image_to_tfexample( + image_data, b'png', height, width, class_id) + tfrecord_writer.write(example.SerializeToString()) + success += 1; + class_cnts[class_id] += 1; + except: + failed = failed + 1; + + + + + sys.stdout.write('%d in total failed!\n' % failed) + sys.stdout.write('%d in total were written successfuly!\n' % success) + sys.stdout.flush() + return class_cnts + + +def _dataset_exists(data, _NUM_SHARDS): + for split_name in ['train', 'validation']: + for shard_id in range(_NUM_SHARDS): + output_filename = _get_dataset_filename( + data, split_name, shard_id, _NUM_SHARDS) + if not tf.gfile.Exists(output_filename): + return False + return True + +def main(): + + #=============CHECKS============== + #Check if there is a dataset directory entered + if not FLAGS.data: + raise ValueError('data is empty. Please state a data argument.') + + #If the TFRecord files already exist in the directory, then exit without creating the files again + if not FLAGS.overwrite and _dataset_exists(data = FLAGS.data, _NUM_SHARDS = FLAGS.num_shards): + print('Dataset files already exist. Exiting without re-creating them.') + print('Use --overwrite flag or remove them') + return None + #==========END OF CHECKS============ + + #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories. + photo_filenames, class_names = _get_filenames_and_classes(FLAGS.data) + + #Refer each of the class name to a specific integer number for predictions later + class_names_to_ids = dict(zip(class_names, range(len(class_names)))) + + #Find the number of validation examples we need + num_validation = int(FLAGS.validation_size * len(photo_filenames)) + + # Divide the training datasets into train and test: + random.seed(FLAGS.random_seed) + random.shuffle(photo_filenames) + training_filenames = photo_filenames[num_validation:] + validation_filenames = photo_filenames[:num_validation] + + # First, convert the training and validation sets. 
+    train_cnts = _convert_dataset('train', training_filenames, class_names_to_ids,
+                                  data=FLAGS.data, _NUM_SHARDS=FLAGS.num_shards)
+    val_cnts = _convert_dataset('validation', validation_filenames, class_names_to_ids,
+                                data=FLAGS.data, _NUM_SHARDS=FLAGS.num_shards)
+
+    # Finally, write the labels file:
+    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
+    dataset_utils.write_label_file(labels_to_class_names, FLAGS.data)
+
+    total_train_cnt = sum(train_cnts)
+    class_cnt = len(train_cnts)
+    class_weights = [total_train_cnt/(train_cnts[i]*class_cnt+1e-10) for i in range(class_cnt)]
+
+    data_info = os.path.join(FLAGS.data, 'dinfo.npz')
+    np.savez(data_info, train_cnt=total_train_cnt,
+             val_cnt=sum(val_cnts),
+             class_weights=class_weights,
+             classes=class_names
+             )
+
+    print('\nFinished converting the dataset!')
+
+if __name__ == "__main__":
+    main()
diff --git a/util/plot-report b/util/plot-report
new file mode 100755
index 0000000..927437f
--- /dev/null
+++ b/util/plot-report
@@ -0,0 +1,102 @@
+#!/usr/bin/python
+import tensorflow as tf
+import numpy as np
+import matplotlib.pyplot as plt
+import scikitplot as skplt
+
+from sklearn.preprocessing import label_binarize
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import auc, confusion_matrix
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import average_precision_score
+from sklearn.metrics import classification_report
+
+flags = tf.app.flags
+
+flags.DEFINE_string('softmax', None, 'The softmax.npz file containing labels and probabilities')
+flags.DEFINE_string('dinfo', None, 'The dinfo.npz file')
+flags.DEFINE_integer('chunks', 4, 'The number of plots to produce')
+
+
+FLAGS = flags.FLAGS
+
+
+def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu'):
+    '''
+    Plot scikit-learn classification report.
+    Extension based on https://stackoverflow.com/a/31689645/395857
+    '''
+    lines = classification_report.split('\n')
+
+    classes = []
+    plotMat = []
+    support = []
+    class_names = []
+    for line in lines[2 : (len(lines) - 2)]:
+        t = line.strip().split()
+        if len(t) < 2:
+            continue
+        classes.append(t[0])
+        v = [float(x) for x in t[1: len(t) - 1]]
+        support.append(int(t[-1]))
+        class_names.append(t[0])
+        print(v)
+        plotMat.append(v)
+
+    print('plotMat: {0}'.format(plotMat))
+    print('support: {0}'.format(support))
+
+    xlabel = 'Metrics'
+    ylabel = 'Classes'
+    xticklabels = ['Precision', 'Recall', 'F1-score']
+    yticklabels = ['{0} ({1})'.format(class_names[idx], sup) for idx, sup in enumerate(support)]
+    figure_width = 25
+    figure_height = len(class_names) + 7
+    correct_orientation = False
+    # NOTE: heatmap() is not defined in this file; this function is not called below.
+    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height, correct_orientation, cmap=cmap)
+
+softmax = np.load(FLAGS.softmax)
+dinfo = np.load(FLAGS.dinfo)
+
+class_names = dinfo['classes']
+
+y_true = softmax['labels']
+y_proba = softmax['predictions']
+
+y_true_sparse = label_binarize(y_true, classes=np.unique(y_true))
+y_pred = np.argmax(y_proba, axis=1)
+
+cl_report = classification_report(y_true, y_pred, target_names=class_names, labels=np.arange(len(class_names)))
+print(cl_report)
+
+cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))
+print(cm)
+
+def top_wrong(cm, N=150):
+    a = cm
+    idx = np.argsort(a.ravel())[-N:][::-1]  # single slicing: `[:N-2:-1]`
+    row_col = np.c_[np.unravel_index(idx, a.shape)]
+    return row_col
+
+#print(top_wrong(cm))
+for idxs in top_wrong(cm):
+    if idxs[0] != idxs[1]:
+        print(class_names[idxs[0]], "\t", class_names[idxs[1]], "\t", cm[idxs[0], idxs[1]])
+
+benign_class = np.where(class_names == 'benign')[0][0]
+
+benign_pages = np.where(y_true == benign_class)[0]
+
+cnt = 0
+cnt9 = 0
+for benign_page in benign_pages:
+    guess = y_pred[benign_page]
+    if guess != benign_class:
+        softmax_val = y_proba[benign_page][guess]
+        cnt += 1
+        if softmax_val > 0.95:
+            print("B: " + class_names[guess] + "\t" + str(softmax_val))
+            cnt9 += 1
+
+print('We have ' + str(cnt9) + ' false-positives with softmax > 0.95 out of ' + str(cnt) + '/' + str(benign_pages.size))
diff --git a/util/plot-softmax b/util/plot-softmax
new file mode 100755
index 0000000..c6c2774
--- /dev/null
+++ b/util/plot-softmax
@@ -0,0 +1,94 @@
+#!/usr/bin/python
+import tensorflow as tf
+import numpy as np
+import matplotlib.pyplot as plt
+import scikitplot as skplt
+
+from sklearn.preprocessing import label_binarize
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import auc
+from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import average_precision_score
+
+flags = tf.app.flags
+
+flags.DEFINE_string('softmax', None, 'The softmax.npz file containing labels and probabilities')
+flags.DEFINE_string('dinfo', None, 'The dinfo.npz file')
+flags.DEFINE_integer('chunks', 4, 'The number of plots to produce')
+
+
+FLAGS = flags.FLAGS
+
+softmax = np.load(FLAGS.softmax)
+dinfo = np.load(FLAGS.dinfo)
+
+class_names = dinfo['classes']
+
+y_true = softmax['labels']
+y_proba = softmax['predictions']
+
+
+def plot_precision_recall(y_true, y_probas,
+                          plot_micro=True,
+                          classes_to_plot=None, ax=None,
+                          figsize=None, cmap='nipy_spectral',
+                          text_fontsize="medium"):
+
+    y_true = np.array(y_true)
+    y_probas = np.array(y_probas)
+
+    classes = np.unique(y_true)
+    probas = y_probas
+
+    if classes_to_plot is None:
+        classes_to_plot = classes
+
+    binarized_y_true = label_binarize(y_true, classes=classes)
+    if len(classes) == 2:
+        binarized_y_true = np.hstack(
+            (1 - binarized_y_true, binarized_y_true))
+
+    fig, ax = plt.subplots(int(FLAGS.chunks/2), 2, figsize=figsize)
+    chunk_size = int(len(classes)/FLAGS.chunks) + int(len(classes) % FLAGS.chunks > 0)
+    print('Chunk size', chunk_size)
+
+    # NOTE: computed but currently unused; all classes are plotted below.
+    indices_to_plot = np.in1d(classes, classes_to_plot)
+
+    for i, img_class in enumerate(classes):
+        average_precision = average_precision_score(
+            binarized_y_true[:, i],
+            probas[:, i])
+        precision, recall, _ = precision_recall_curve(
+            y_true, probas[:, i], pos_label=img_class)
+        color = plt.cm.get_cmap(cmap)(float(i % chunk_size) / chunk_size)
+        ax[int(i/(chunk_size*2)), int(i % (chunk_size*2) > chunk_size)].plot(
+            recall, precision, lw=2,
+            label='{0} (area = {1:0.3f})'.format(class_names[int(img_class)],
+                                                 average_precision),
+            color=color)
+
+    if plot_micro:
+        precision, recall, _ = precision_recall_curve(
+            binarized_y_true.ravel(), probas.ravel())
+        average_precision = average_precision_score(binarized_y_true,
+                                                    probas,
+                                                    average='micro')
+        ax[int(FLAGS.chunks/2)-1, 1].plot(recall, precision,
+                                          label='micro-average PR '
+                                                '(area = {0:0.3f})'.format(average_precision),
+                                          color='navy', linestyle=':', linewidth=4)
+
+    for x in range(int(FLAGS.chunks/2)):
+        for y in range(2):
+            ax[x, y].set_xlim([0.0, 1.0])
+            ax[x, y].set_ylim([0.0, 1.05])
+            ax[x, y].set_xlabel('Recall')
+            ax[x, y].set_ylabel('Precision')
+            ax[x, y].tick_params(labelsize=text_fontsize)
+            ax[x, y].legend(loc='lower left', fontsize=text_fontsize)
+    return ax
+
+plot_precision_recall(y_true, y_proba, text_fontsize="xx-small", classes_to_plot=[3, 16, 41, 70, 77, 82])
+plt.show()
diff --git a/util/splitter b/util/splitter
new file mode 100755
index 0000000..0373669
--- /dev/null
+++ b/util/splitter
@@ -0,0 +1,30 @@
+#!/usr/bin/perl
+# Splits a concatenated screenshot dump into images-man/<target>/ PNG files.
+# Records start with a line of the form: <target>,,,vas,,,<md5>,,,vas,,,<png data...>
+my $target;
+my $md5_hash;
+my $png;
+my $count = 0;
+
+
+while (<>) {
+    if (/(.*),,,vas,,,(.*),,,vas,,,(.*)/) {
+        if ($target) {
+            mkdir "images-man/$target" unless -d "images-man/$target";
+            open(my $fh, '>', "images-man/$target/$target-$md5_hash.png") or die "could not write";
+            print $fh $png;
+            close $fh;
+        }
+        $count++;
+        $target = $1;
+        $md5_hash = $2;
+        $png = $3."\n";
+    } else {
+        $png .= $_;
+    }
+}
+
+mkdir "images-man/$target" unless -d "images-man/$target";
+open(my $fh, '>', "images-man/$target/$target-$md5_hash.png") or die;
+print $fh $png;
+close $fh;
+
+print($count, " images written\n");
diff --git a/util/splitter-man b/util/splitter-man
new file mode 100755
index 0000000..fbdef17
--- /dev/null
+++ b/util/splitter-man
@@ -0,0 +1,30 @@
+#!/usr/bin/perl
+# Same as util/splitter, but writes into images/<target>/ instead.
+my $target;
+my $md5_hash;
+my $png;
+my $count = 0;
+
+
+while (<>) {
+    if (/(.*),,,vas,,,(.*),,,vas,,,(.*)/) {
+        if ($target) {
+            mkdir "images/$target" unless -d "images/$target";
+            open(my $fh, '>', "images/$target/$target-$md5_hash.png") or die "could not write";
+            print $fh $png;
+            close $fh;
+        }
+        $count++;
+        $target = $1;
+        $md5_hash = $2;
+        $png = $3."\n";
+    } else {
+        $png .= $_;
+    }
+}
+
+mkdir "images/$target" unless -d "images/$target";
+open(my $fh, '>', "images/$target/$target-$md5_hash.png") or die;
+print $fh $png;
+close $fh;
+
+print($count, " images written\n");
diff --git a/util/test.dump b/util/test.dump
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/util/test.dump
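
For completeness, a sketch — again not part of the commit — of consuming the two artefacts make-tfrecords.py writes alongside the records: dinfo.npz (saved via np.savez above) and labels.txt (written by dataset_utils.write_label_file() as '<id>:<name>' lines). The data_dir value is a hypothetical stand-in for whatever was passed as --data:

    import os
    import numpy as np

    data_dir = 'data'  # hypothetical; the directory given as --data
    dinfo = np.load(os.path.join(data_dir, 'dinfo.npz'))
    print(dinfo['train_cnt'], dinfo['val_cnt'])
    class_weights = dinfo['class_weights']  # inverse-frequency weights from the converter

    # labels.txt lines have the form '<id>:<class name>'.
    labels = {}
    with open(os.path.join(data_dir, 'labels.txt')) as f:
        for line in f:
            if line.strip():
                idx, name = line.rstrip('\n').split(':', 1)
                labels[int(idx)] = name
    print(labels)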