#!/usr/bin/python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""DenseNet implemented with Keras running on Cloud TPUs.

This file shows how you can run DenseNet on a Cloud TPU using the TensorFlow
Keras support. This is configured for ImageNet (e.g. 1000 classes), but you
can easily adapt it to your own datasets by changing the code appropriately.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf

import eval_utils
import imagenet_input
from models.densenet import DenseNetImageNet121
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.optimizer_v2 import gradient_descent

try:
  import h5py as _  # pylint: disable=g-import-not-at-top
  HAS_H5PY = True
except ImportError:
  logging.warning('`h5py` is not installed. Please consider installing it '
                  'to save weights for long-running training.')
  HAS_H5PY = False

# ImageNet training and test data sets.
IMAGE_WIDTH = 320
IMAGE_HEIGHT = 240
EPOCHS = 90  # Standard ImageNet training regime.

# Training hyperparameters.
NUM_CORES = 8
PER_CORE_BATCH_SIZE = 64
CPU_BATCH_SIZE = 4
BATCH_SIZE = NUM_CORES * PER_CORE_BATCH_SIZE
BASE_LEARNING_RATE = 0.4

# Learning rate schedule.
LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]

DEFAULT_WEIGHTS_H5 = 'resnet50_weights.h5'
DEFAULT_LOG_DIR = '/tmp/netcraft'
DEFAULT_BUCKET = 'gs://netcraft/'

# Number of training steps per epoch. Set in main() from the dataset metadata
# and read by learning_rate_schedule() below.
train_steps_per_epoch = None

flags.DEFINE_integer('epochs', EPOCHS, '')
flags.DEFINE_string('weights', None, 'Use saved weights')
flags.DEFINE_string('bucket', DEFAULT_BUCKET, 'Bucket to use')
flags.DEFINE_string('tpu', None, 'Name of the TPU to use.')
flags.DEFINE_string('data', None, 'Path to training and testing data.')
flags.DEFINE_string(
    'log', DEFAULT_LOG_DIR,
    ('The directory where the model weights and training/evaluation summaries '
     'are stored. If not specified, save to /tmp/netcraft.'))
flags.DEFINE_bool(
    'complete_eval', True,
    'Eval both top 1 and top 5 accuracy. Otherwise, only eval top 1 accuracy. '
    'Furthermore, generate confusion matrices and save softmax values in log_dir.')
flags.DEFINE_bool('evalonly', False,
                  'Only run eval with given weights, do not train')

FLAGS = flags.FLAGS


def learning_rate_schedule(current_epoch, current_batch):
  """Handles linear scaling rule, gradual warmup, and LR decay.

  The learning rate starts at 0, then it increases linearly per step. After 5
  epochs we reach the base learning rate (scaled to account for batch size).
  After 30, 60 and 80 epochs the learning rate is divided by 10. After 90
  epochs training stops and the LR is set to 0. This ensures that we train for
  exactly 90 epochs for reproducibility.

  Args:
    current_epoch: integer, current epoch indexed from 0.
    current_batch: integer, current batch in the current epoch, indexed from 0.

  Returns:
    Adjusted learning rate.
  """
  epoch = current_epoch + float(current_batch) / train_steps_per_epoch
  warmup_lr_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
  if epoch < warmup_end_epoch:
    # Learning rate increases linearly per step.
    return BASE_LEARNING_RATE * warmup_lr_multiplier * epoch / warmup_end_epoch
  for mult, start_epoch in LR_SCHEDULE:
    if epoch >= start_epoch:
      learning_rate = BASE_LEARNING_RATE * mult
    else:
      break
  return learning_rate
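
# Illustrative values of the schedule above (computed from BASE_LEARNING_RATE
# and LR_SCHEDULE with current_batch == 0, once train_steps_per_epoch is set
# in main()):
#   learning_rate_schedule(2, 0)  -> 0.4 * 2 / 5   = 0.16    (linear warmup)
#   learning_rate_schedule(10, 0) -> 0.4 * 1.0     = 0.4     (base rate)
#   learning_rate_schedule(35, 0) -> 0.4 * 0.1     = 0.04    (first decay)
#   learning_rate_schedule(85, 0) -> 0.4 * 0.001   = 0.0004  (final decay)
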
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
  """Callback to update learning rate on every batch (not epoch boundaries).

  N.B. Only supports Keras optimizers, not TF optimizers.

  Args:
    schedule: a function that takes an epoch index and a batch index as input
      (both integer, indexed from 0) and returns a new learning rate as output
      (float).
  """

  def __init__(self, schedule):
    super(LearningRateBatchScheduler, self).__init__()
    self.schedule = schedule
    self.epochs = -1
    self.prev_lr = -1

  def on_epoch_begin(self, epoch, logs=None):
    if not hasattr(self.model.optimizer, 'lr'):
      raise ValueError('Optimizer must have a "lr" attribute.')
    self.epochs += 1

  def on_batch_begin(self, batch, logs=None):
    lr = self.schedule(self.epochs, batch)
    if not isinstance(lr, (float, np.float32, np.float64)):
      raise ValueError('The output of the "schedule" function should be float.')
    if lr != self.prev_lr:
      K.set_value(self.model.optimizer.lr, lr)
      self.prev_lr = lr
      logging.debug('Epoch %05d Batch %05d: LearningRateBatchScheduler change '
                    'learning rate to %s.', self.epochs, batch, lr)
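
# Usage sketch (this mirrors what main() does below): the callback is handed to
# Keras fit() and rewrites the optimizer's `lr` variable at every batch:
#
#   callbacks = [LearningRateBatchScheduler(schedule=learning_rate_schedule)]
#   model.fit(model_in, epochs=EPOCHS,
#             steps_per_epoch=train_steps_per_epoch, callbacks=callbacks)
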
def main(argv):
  global train_steps_per_epoch  # Read by learning_rate_schedule().

  if FLAGS.data:
    dinfo = np.load(os.path.join(FLAGS.data, 'dinfo.npz'))
    classes = dinfo['classes']
    NUM_CLASSES = len(classes)
    train_cnt = dinfo['train_cnt']  # e.g. 1141 or 50273; approximate number of training images.
    val_cnt = dinfo['val_cnt']  # e.g. 488 or 12560; number of validation images.
  else:
    # No dataset metadata available; fall back to a single synthetic batch with
    # the ImageNet class count mentioned in the module docstring.
    classes = None
    NUM_CLASSES = 1000
    train_cnt = BATCH_SIZE
    val_cnt = BATCH_SIZE

  train_steps_per_epoch = int(train_cnt / BATCH_SIZE)
  val_steps = int(val_cnt // BATCH_SIZE)
  print("Using", train_cnt, "training images and", val_cnt, "for testing")

  logging.info('Building Keras DenseNet model')
  model = DenseNetImageNet121(classes=NUM_CLASSES, weights=None)

  if FLAGS.tpu:
    logging.info('Converting from CPU to TPU model.')
    resolver = tf.contrib.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu)
    strategy = tf.contrib.tpu.TPUDistributionStrategy(resolver)
    model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)

  logging.info('Compiling model.')
  model.compile(
      optimizer=gradient_descent.SGD(
          learning_rate=BASE_LEARNING_RATE, momentum=0.9, nesterov=True),
      loss='sparse_categorical_crossentropy',
      metrics=['sparse_categorical_accuracy'])

  if FLAGS.data is None:
    training_images = np.random.randn(
        BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, 3).astype(np.float32)
    training_labels = np.random.randint(NUM_CLASSES, size=BATCH_SIZE,
                                        dtype=np.int32)
    logging.info('Training model using synthetic data.')
    model.fit(
        training_images,
        training_labels,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE)
    logging.info('Evaluating the model on synthetic data.')
    model.evaluate(training_images, training_labels, verbose=0)
  else:
    per_core_batch_size = PER_CORE_BATCH_SIZE if FLAGS.tpu else CPU_BATCH_SIZE
    imagenet_train = imagenet_input.ImageNetInput(
        is_training=True,
        data_dir=FLAGS.bucket + FLAGS.data if FLAGS.tpu else FLAGS.data,
        per_core_batch_size=per_core_batch_size)
    logging.info('Training model using real data in directory "%s".',
                 FLAGS.data)
    # If evaluating complete_eval, we feed the inputs from a Python generator,
    # so we need to build a single batch for all of the cores, which will be
    # split on TPU.
    per_core_batch_size = (
        BATCH_SIZE if FLAGS.complete_eval else PER_CORE_BATCH_SIZE)
    imagenet_validation = imagenet_input.ImageNetInput(
        is_training=False,
        data_dir=FLAGS.bucket + FLAGS.data if FLAGS.tpu else FLAGS.data,
        per_core_batch_size=per_core_batch_size)

    eval_callback = eval_utils.TensorBoardWithValidation(
        log_dir=FLAGS.log,
        validation_imagenet_input=imagenet_validation,
        validation_steps=val_steps,
        validation_epochs=[3, 10, 30, 60, 90],
        write_images=True,
        write_graph=True,
        plot_wrong=True,
        plot_cm=True,
        plot_pr=True,
        classes=classes,
        complete_eval=FLAGS.complete_eval)

    callbacks = [
        LearningRateBatchScheduler(schedule=learning_rate_schedule),
        eval_callback,
    ]

    if FLAGS.tpu:
      model_in = imagenet_train.input_fn
    else:
      model_in = imagenet_train.input_fn()

    if FLAGS.weights:
      weights_file = os.path.join(FLAGS.weights)
      logging.info('Loading model and weights from %s', weights_file)
      model.load_weights(weights_file)
    else:
      weights_file = os.path.join(DEFAULT_WEIGHTS_H5)

    if FLAGS.evalonly:
      eval_callback.set_model(model)
      eval_callback.on_epoch_end(420)
    else:
      model.fit(model_in,
                epochs=EPOCHS,
                steps_per_epoch=train_steps_per_epoch,
                callbacks=callbacks)
      logging.info('Saving weights into %s', weights_file)
      model.save_weights(weights_file, overwrite=True)


if __name__ == '__main__':
  tf.logging.set_verbosity(tf.logging.INFO)
  app.run(main)
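
# Example invocations (the script file name and data paths are placeholders):
#
#   # Train on a Cloud TPU; with --tpu set, data is read from FLAGS.bucket + FLAGS.data.
#   python3 densenet_keras.py --tpu=my-tpu --data=imagenet/ --log=/tmp/netcraft
#
#   # Evaluate only, using previously saved weights and a local data directory.
#   python3 densenet_keras.py --data=/path/to/data --weights=saved_weights.h5 --evalonly
#
# FLAGS.data must point at a directory containing `dinfo.npz` plus the records
# consumed by imagenet_input.ImageNetInput.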