# Source code for collie.model.base.trainer

import sys
from typing import Optional, Tuple, Union
import warnings

from pytorch_lightning import Trainer
try:
    from pytorch_lightning.utilities.model_summary import ModelSummary
except ImportError:  # compatible with old ``ModelSummary`` API used in versions prior to ``1.5``
    from pytorch_lightning.core.memory import ModelSummary
try:
    from pytorch_lightning.loggers.logger import Logger as LightningLoggerBase
except ImportError:  # compatible with old ``LightningLoggerBase`` used in versions prior to ``1.6``
    from pytorch_lightning.loggers.base import LightningLoggerBase
from pytorch_lightning.utilities import move_data_to_device
import torch
from tqdm.auto import tqdm

from collie.model.base.base_pipeline import BasePipeline
from collie.model.base.layers import MultiLRScheduler, MultiOptimizer


class CollieTrainer(Trainer):
    """
    Helper wrapper class around PyTorch Lightning's ``Trainer`` class.

    Specifically, this wrapper:

    * Checks if a model has a validation dataset passed in (under the ``val_loader`` attribute)
      and, if not, sets ``num_sanity_val_steps`` to 0 and ``check_val_every_n_epoch`` to
      ``sys.maxsize`` so validation effectively never runs.

    * Checks if a GPU is available and, if ``gpus is None``, sets ``gpus = 1``.

    See ``pytorch_lightning.Trainer`` documentation for more details at:
    https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api

    Compared with ``CollieMinimalTrainer``, PyTorch Lightning's ``Trainer`` offers more
    flexibility and room for exploration, at the cost of a higher training time (which is
    especially true for larger models). We recommend starting all model exploration with this
    ``CollieTrainer`` (callbacks, automatic Lightning optimizations, etc.), finding a set of
    hyperparameters that work for your training job, then using this in the simpler but faster
    ``CollieMinimalTrainer``.

    Parameters
    ----------
    model: collie.model.BasePipeline
        Initialized Collie model
    max_epochs: int
        Stop training once this number of epochs is reached
    benchmark: bool
        If set to ``True``, enables ``cudnn.benchmark``
    deterministic: bool
        If set to ``True``, enables ``cudnn.deterministic``
    **kwargs: keyword arguments
        Additional keyword arguments to be sent to the ``Trainer`` class:
        https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api

    Original ``pytorch_lightning.Trainer`` docstring as follows:

    ########

    """
    def __init__(self,
                 model: torch.nn.Module,
                 max_epochs: int = 10,
                 benchmark: bool = True,
                 deterministic: bool = True,
                 **kwargs):
        # without validation data there is nothing to sanity-check and no reason to ever
        # trigger a validation epoch, so push it out of reach
        if getattr(model, 'val_loader', None) is None:
            print('Did not detect ``val_loader``. Setting ``num_sanity_val_steps`` to 0.')
            kwargs['num_sanity_val_steps'] = 0
            kwargs['check_val_every_n_epoch'] = sys.maxsize

        # default to a single GPU whenever one exists and the caller expressed no preference
        if torch.cuda.is_available() and kwargs.get('gpus') is None:
            print('Detected GPU. Setting ``gpus`` to 1.')
            kwargs['gpus'] = 1

        # explicit keyword arguments always win over anything already in ``kwargs``
        kwargs.update(max_epochs=max_epochs,
                      benchmark=benchmark,
                      deterministic=deterministic)

        super().__init__(**kwargs)

    # append the upstream ``Trainer.__init__`` docs after the ``########`` marker above
    __doc__ += Trainer.__init__.__doc__

    @property
    def max_epochs(self):
        """
        Property that just returns ``max_epochs``, included only so we can have a setter for it
        without an ``AttributeError``.
        """
        try:
            return self.fit_loop.max_epochs
        except AttributeError:
            # compatible with old PyTorch Lightning ``Trainer`` API prior to version ``1.4.0``,
            # where ``max_epochs`` lived directly on the trainer
            return self._max_epochs

    @max_epochs.setter
    def max_epochs(self, value: int):
        """
        Set the ``max_epochs`` attribute to ``value``.

        Parameters
        ----------
        value: int
            Value to set ``max_epochs`` attribute to

        """
        try:
            self.fit_loop.max_epochs = value
        except AttributeError:
            # compatible with old PyTorch Lightning ``Trainer`` API prior to version ``1.4.0``
            self._max_epochs = value
class CollieMinimalTrainer():
    """
    A more manual implementation of PyTorch Lightning's ``Trainer`` class, attempting to port
    over the most commonly used ``Trainer`` arguments into a training loop with more transparency
    and faster training times.

    Through extensive experimentation, we found that PyTorch Lightning's ``Trainer`` was training
    Collie models about 25% slower than the more manual, typical PyTorch training loop
    boilerplate. Thus, we created the ``CollieMinimalTrainer``, which shares a similar API to
    PyTorch Lightning's ``Trainer`` object (both in instantiation and in usage), with a standard
    PyTorch training loop in its place.

    While PyTorch Lightning's ``Trainer`` offers more flexibility and customization through the
    addition of the additional ``Trainer`` arguments and ``callbacks``, we designed this class as
    a way to train a model in production, where we might be more focused on faster training times
    and less on hyperparameter tuning and R&D, where one might instead opt to use PyTorch
    Lightning's ``Trainer`` class.

    Note that the arguments the ``CollieMinimalTrainer`` trainer accepts will be slightly
    different than the ones that the ``CollieTrainer`` accept, and defaults are also not
    guaranteed to be equal as the two libraries evolve. Notable changes are:

    * If ``gpus > 1``, only a single GPU will be used and any other GPUs will remain unused.
      Multi-GPU training is not supported in ``CollieMinimalTrainer`` at this time.

    * ``logger == True`` has no meaning in ``CollieMinimalTrainer`` - a default logger will NOT
      be created if set to ``True``.

    * There is no way to pass in ``callbacks`` at this time. Instead, we will implement the most
      used ones during training here, manually, in favor of greater speed over customization. To
      use early stopping, set the ``early_stopping_patience`` to an integer other than ``None``.

    .. code-block:: python

        from collie.model import CollieMinimalTrainer, MatrixFactorizationModel


        # notice how similar the usage is to the standard ``CollieTrainer``
        model = MatrixFactorizationModel(train=train)
        trainer = CollieMinimalTrainer(model)
        trainer.fit(model)

    Model results should NOT be significantly different whether trained with ``CollieTrainer``
    or ``CollieMinimalTrainer``.

    Parameters
    ----------
    model: collie.model.BasePipeline
        Initialized Collie model
    max_epochs: int
        Stop training once this number of epochs is reached
    gpus: bool or int
        Whether to train on the GPU (``gpus == True`` or ``gpus > 0``) or the CPU
    logger: LightningLoggerBase
        Logger for experiment tracking. Set ``logger = None`` or ``logger = False`` to disable
        logging
    early_stopping_patience: int
        Number of epochs of patience to have without any improvement in loss before stopping
        training early. Validation epoch loss will be used if there is a validation DataLoader
        present, else training epoch loss will be used. Set ``early_stopping_patience = None``
        or ``early_stopping_patience = False`` to disable early stopping
    log_every_n_steps: int
        How often to log within steps, if ``logger`` is enabled
    flush_logs_every_n_steps: int
        How often to flush logs to disk, if ``logger`` is enabled
    enable_model_summary: bool
        Whether to enable or disable the model summarization
    weights_summary: str
        Deprecated, replaced with ``enable_model_summary``. Prints summary of the weights when
        training begins
    detect_anomaly: bool
        Context-manager that enables anomaly detection for the autograd engine. Running the
        forward pass with detection enabled allows the backward pass to print the traceback of
        the forward operation that created the failing backward function, and any backward
        computation that generates a ``NaN`` value will raise an error. This mode should be
        enabled only for debugging, as it will slow down program execution
    terminate_on_nan: bool
        Deprecated, replaced with ``detect_anomaly``. If set to ``True``, will terminate
        training (by raising a ``ValueError``) at the end of each training batch, if any of the
        parameters or the loss are NaN or +/- infinity
    benchmark: bool
        If set to ``True``, enables ``cudnn.benchmark``
    deterministic: bool
        If set to ``True``, enables ``cudnn.deterministic``
    progress_bar_refresh_rate: int
        How often to refresh progress bar (in steps), if ``verbosity > 0``
    verbosity: Union[bool, int]
        How verbose to be in training.

        * ``0`` disables all printouts, including ``weights_summary``

        * ``1`` prints ``weights_summary`` (if applicable) and epoch losses

        * ``2`` prints ``weights_summary`` (if applicable), epoch losses, and progress bars

    """
    def __init__(self,
                 model: BasePipeline,
                 max_epochs: int = 10,
                 gpus: Optional[Union[bool, int]] = None,
                 logger: Optional[LightningLoggerBase] = None,
                 early_stopping_patience: Optional[int] = 3,
                 log_every_n_steps: int = 50,
                 flush_logs_every_n_steps: int = 100,
                 enable_model_summary: bool = True,
                 weights_summary: Optional[str] = None,
                 detect_anomaly: bool = False,
                 terminate_on_nan: Optional[bool] = None,
                 benchmark: bool = True,
                 deterministic: bool = True,
                 progress_bar_refresh_rate: Optional[int] = None,
                 verbosity: Union[bool, int] = True):
        # some light argument validation before saving as class-level attributes
        if gpus is None and torch.cuda.is_available():
            print('Detected GPU. Setting ``gpus`` to 1.')
            gpus = 1

        if logger is False:
            logger = None

        if early_stopping_patience is False:
            early_stopping_patience = None

        if verbosity is True:
            verbosity = 2
        elif verbosity is False:
            verbosity = 0

        if weights_summary is not None:
            warnings.warn(
                '``weights_summary`` is deprecated and is replaced with ``enable_model_summary``.',
                DeprecationWarning
            )

        if terminate_on_nan is not None:
            warnings.warn(
                '``terminate_on_nan`` is deprecated and is replaced with ``detect_anomaly``.',
                DeprecationWarning
            )
            # only honor the deprecated flag when it was actually provided. Previously this
            # assignment ran unconditionally, so the default ``terminate_on_nan=None`` would
            # silently overwrite ``detect_anomaly=False`` with ``None``
            if detect_anomaly is False:
                detect_anomaly = terminate_on_nan

        self.max_epochs = max_epochs
        self.gpus = gpus
        self.benchmark = benchmark
        self.deterministic = deterministic
        self.logger = logger
        self.early_stopping_patience = early_stopping_patience
        self.log_every_n_steps = log_every_n_steps
        self.flush_logs_every_n_steps = flush_logs_every_n_steps
        self.enable_model_summary = enable_model_summary
        self.weights_summary = weights_summary
        self.detect_anomaly = detect_anomaly
        self.terminate_on_nan = terminate_on_nan
        self.progress_bar_refresh_rate = progress_bar_refresh_rate
        self.verbosity = verbosity

        # ``(epoch, loss)`` of the best epoch seen so far, used for early stopping
        self.best_epoch_loss = (0, sys.maxsize)
        self.train_steps = 0
        self.val_steps = 0
        self.num_epochs_completed = 0

        if self.gpus is None or self.gpus is False or self.gpus == 0:
            self.device = 'cpu'
        else:
            self.device = 'cuda'

        torch.backends.cudnn.benchmark = self.benchmark
        torch.backends.cudnn.deterministic = self.deterministic

    @property
    def max_epochs(self):
        """
        Property that just returns ``max_epochs``, included only so we can have a setter for it
        without an ``AttributeError``.
        """
        return self._max_epochs

    @max_epochs.setter
    def max_epochs(self, value: int):
        """
        Set the ``max_epochs`` attribute to ``value``.

        Parameters
        ----------
        value: int
            Value to set ``max_epochs`` attribute to

        """
        self._max_epochs = value

    def fit(self, model: BasePipeline) -> None:
        """
        Runs the full optimization routine.

        Parameters
        ----------
        model: collie.model.BasePipeline
            Initialized Collie model

        """
        # DataLoaders, logger setup, and device placement only need to happen once, even if
        # ``fit`` is called repeatedly to continue training
        if (
            not hasattr(self, 'first_run_pre_training_setup_complete_')
            or not self.first_run_pre_training_setup_complete_
        ):
            self._pre_training_setup(model=model)
            self.first_run_pre_training_setup_complete_ = True

        # optimizers are recreated on every ``fit`` call so a changed learning rate takes effect
        self._initialize_optimizers_and_lr_schedulers(model=model)

        with torch.autograd.set_detect_anomaly(self.detect_anomaly):
            self._fit(model)

    def _fit(self, model: BasePipeline) -> None:
        """Main epoch loop: train, optionally validate, log, early-stop, and step schedulers."""
        # set up top-level epoch progress bar
        epoch_iterator = range(self.num_epochs_completed + 1, self.max_epochs + 1)
        if self.verbosity >= 2:
            epoch_iterator = tqdm(epoch_iterator,
                                  position=0,
                                  unit='epoch',
                                  desc='',
                                  miniters=self.progress_bar_refresh_rate)

        for epoch in epoch_iterator:
            # run the training loop
            model.train()
            train_loss = self._train_loop_single_epoch(model, epoch)
            model.eval()

            epoch_summary = f'Epoch {epoch: >5}: train loss: {train_loss :<1.5f}'
            early_stop_loss = train_loss

            # save epoch loss metrics to the logger
            if self.logger is not None:
                self.logger.log_metrics(metrics={'train_loss_epoch': train_loss}, step=epoch)

            # run the validation loop logic, if we have the ``val_dataloader`` to do so
            if self.val_dataloader is not None:
                val_loss = self._val_loop_single_epoch(model)

                epoch_summary += f', val loss: {val_loss :<1.5f}'
                # validation loss supersedes training loss for early stopping when available
                early_stop_loss = val_loss

                if self.logger is not None:
                    self.logger.log_metrics(metrics={'val_loss_epoch': val_loss}, step=epoch)

            # write out to disk only a single time at the end of the epoch
            if self.logger is not None:
                self.logger.save()

            if self.verbosity >= 1:
                print(epoch_summary)

            model.hparams.num_epochs_completed += 1
            self.num_epochs_completed += 1

            # early stopping logic: stop once ``early_stopping_patience`` epochs have passed
            # since the best epoch without any improvement
            if (
                self.early_stopping_patience is not None
                and early_stop_loss >= self.best_epoch_loss[1]
                and epoch >= (self.early_stopping_patience + self.best_epoch_loss[0])
            ):
                print(f'Epoch {epoch :>5}: Early stopping activated.')
                self._finalize_training()
                return

            # save best loss stats for future early stopping logic
            if early_stop_loss < self.best_epoch_loss[1]:
                self.best_epoch_loss = (epoch, early_stop_loss)

            # learning rate scheduler stepping, if applicable
            if self.lr_scheduler is not None:
                try:
                    # used for most learning rate schedulers
                    self.lr_scheduler.step()
                except TypeError:
                    # used for ``ReduceLROnPlateau``, which requires the metric being monitored
                    self.lr_scheduler.step(early_stop_loss)

        # run final logging things when training is complete before returning
        self._finalize_training()

    def _pre_training_setup(self, model: BasePipeline) -> None:
        """Set up DataLoaders, optimizers, learning rate schedulers, etc. before training starts."""
        self.train_dataloader = model.train_dataloader()
        self.val_dataloader = model.val_dataloader()

        if self.verbosity != 0 and (
            self.weights_summary is not None or self.enable_model_summary is True
        ):
            try:
                print(ModelSummary(model, max_depth=int(self.enable_model_summary)))
            except TypeError:
                # compatible with old ``ModelSummary`` API used in versions prior to ``1.6``
                print(ModelSummary(model, mode=self.weights_summary))

        # log model hyperparameters, if applicable
        if self.logger is not None:
            self.logger.log_hyperparams(model.hparams)
            self.logger.save()

        # move the model over to the device
        model.to(self.device)
        model._move_any_external_data_to_device()

    def _initialize_optimizers_and_lr_schedulers(self, model: BasePipeline) -> None:
        """Normalize whatever ``model.configure_optimizers`` returns into Multi* wrappers."""
        self.lr_scheduler = None

        configure_optimizers_return_value = model.configure_optimizers()

        if isinstance(configure_optimizers_return_value, tuple):
            # we have a list of optimizers and a list of lr_schedulers dictionaries
            optimizers, lr_schedulers = configure_optimizers_return_value
            self.optimizer = MultiOptimizer(optimizers)
            self.lr_scheduler = MultiLRScheduler(lr_schedulers)
        elif isinstance(configure_optimizers_return_value, list):
            # we have a list of optimizers
            self.optimizer = MultiOptimizer(configure_optimizers_return_value)
        elif isinstance(configure_optimizers_return_value, torch.optim.Optimizer):
            # we have a single optimizer
            self.optimizer = MultiOptimizer([configure_optimizers_return_value])
        else:
            # we have something we've never seen before
            raise ValueError('Unexpected output from ``model.configure_optimizers()``!')

    def _train_loop_single_epoch(self, model: torch.nn.Module, epoch: int) -> float:
        """Training loop for a single epoch, where gradients are optimized for."""
        total_loss = 0

        train_dataloader_iterator = enumerate(self.train_dataloader)
        if self.verbosity >= 2:
            train_dataloader_iterator = tqdm(train_dataloader_iterator,
                                             total=len(self.train_dataloader),
                                             unit='step',
                                             desc=f'({epoch :^5})',
                                             leave=False,
                                             miniters=self.progress_bar_refresh_rate)

        for batch_idx, batch in train_dataloader_iterator:
            self.optimizer.zero_grad()

            batch = self._move_batch_to_device(batch)
            loss = model.calculate_loss(batch)
            loss.backward()

            # delegate the actual stepping to the model so it can customize per-optimizer logic
            for optimizer_idx, optimizer in enumerate(self.optimizer.optimizers):
                model.optimizer_step(epoch=epoch,
                                     batch_idx=batch_idx,
                                     optimizer=optimizer,
                                     optimizer_idx=optimizer_idx,
                                     optimizer_closure=None)

            self.train_steps += 1

            detached_loss = loss.detach()
            total_loss += detached_loss

            if self.verbosity >= 2:
                train_dataloader_iterator.set_postfix(train_loss=detached_loss.item())

            self._log_step(name='train',
                           steps=self.train_steps,
                           total_loss=total_loss,
                           batch_idx=batch_idx)

        return (total_loss / len(self.train_dataloader)).item()

    def _val_loop_single_epoch(self, model: torch.nn.Module) -> float:
        """Validation loop for a single epoch, where gradients are NOT optimized for."""
        total_loss = 0

        # gradients are never used here, so skip building the autograd graph entirely - this
        # saves both memory and time during validation
        with torch.no_grad():
            for batch_idx, batch in enumerate(self.val_dataloader):
                batch = self._move_batch_to_device(batch)
                loss = model.calculate_loss(batch)

                self.val_steps += 1

                total_loss += loss.detach()

                self._log_step(name='val',
                               steps=self.val_steps,
                               total_loss=total_loss,
                               batch_idx=batch_idx)

        return (total_loss / len(self.val_dataloader)).item()

    def _move_batch_to_device(
        self,
        batch: Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor],
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
        """Move a batch of data to the proper device."""
        # TODO: does this actually speed anything up?
        try:
            # assume we have implicit data
            ((users, pos_items), neg_items) = batch

            users = users.to(self.device)
            pos_items = pos_items.to(self.device)
            neg_items = neg_items.to(self.device)

            return ((users, pos_items), neg_items)
        except (AttributeError, ValueError):
            try:
                # now assume we have explicit data
                users, pos_items, ratings = batch

                users = users.to(self.device)
                pos_items = pos_items.to(self.device)
                ratings = ratings.to(self.device)

                return users, pos_items, ratings
            except (AttributeError, ValueError):
                # we have an unexpected data format, fallback to PyTorch Lightning
                return move_data_to_device(batch, self.device)

    def _log_step(self, name: str, steps: int, total_loss: torch.Tensor, batch_idx: int) -> None:
        """Check if we should and, if so, log step-loss metrics to our logger."""
        if self.logger is not None:
            if steps % self.log_every_n_steps == 0:
                # running mean of the epoch's loss so far, not the single-batch loss
                batch_loss = (total_loss / (batch_idx + 1)).item()
                self.logger.log_metrics(metrics={f'{name}_loss_step': batch_loss}, step=steps)
            if steps % self.flush_logs_every_n_steps == 0:
                self.logger.save()

    def _finalize_training(self) -> None:
        """Finalize logging results before returning."""
        if self.logger is not None:
            self.logger.save()
            self.logger.finalize(status='FINISHED')