# Source code for collie.model.hybrid_matrix_factorization

from functools import partial
import os
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Optional, Union
import warnings

import joblib
import numpy as np
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau

from collie.config import DATA_PATH
from collie.interactions import (ApproximateNegativeSamplingInteractionsDataLoader,
                                 Interactions,
                                 InteractionsDataLoader)
from collie.model.base import MultiStagePipeline, ScaledEmbedding, ZeroEmbedding
from collie.utils import get_init_arguments, merge_docstrings

# Any object ``HybridModel`` accepts as ``train`` / ``val`` data.
# NOTE(review): the original alias was truncated after its first member; the remaining members
# are restored from the ``collie.interactions`` public API - confirm against the upstream file.
INTERACTIONS_LIKE_INPUT = Union[ApproximateNegativeSamplingInteractionsDataLoader,
                                Interactions,
                                InteractionsDataLoader]

class HybridModel(MultiStagePipeline):
    # NOTE: the full docstring is merged in with ``MultiStagePipeline``'s using
    # ``merge_docstrings``. Only the description of new or changed parameters are included in this
    # docstring
    """
    Training pipeline for a multi-stage hybrid recommendation model.

    ``HybridModel`` models contain dense layers that process item and/or user metadata, concatenate
    this embedding with user and item embeddings, sending this concatenated embedding through more
    dense layers to output a single float ranking / rating. We add both user and item biases to
    this score before returning. This is the same architecture as the ``HybridPretrainedModel``,
    but we are training the embeddings ourselves rather than relying on pulling this from a
    pre-trained model.

    The stages in a ``HybridModel`` depend on whether both item and user metadata is used. For the
    full model, they are, in order:

    1. ``matrix_factorization``
        Matrix factorization exactly as we do in ``MatrixFactorizationModel``. In this stage,
        metadata is NOT incorporated into the model.
    2. ``metadata_only``
        User and item embeddings terms are frozen, and the MLP layers for the metadata (if
        specified) and combined embedding-metadata data are optimized.
    3. ``all``
        Embedding and MLP layers are all optimized together, including those for metadata.

    All ``HybridModel`` instances are subclasses of the ``LightningModule`` class provided by
    PyTorch Lightning. This means to train a model, you will need a
    ``collie.model.CollieTrainer`` object, but the model can be saved and loaded without this
    ``Trainer`` instance. Example usage may look like:

    .. code-block:: python

        from collie.model import CollieTrainer, HybridModel


        # instantiate and fit a ``HybridModel`` as expected
        model = HybridModel(train=train,
                            item_metadata=item_metadata,
                            user_metadata=user_metadata)
        trainer = CollieTrainer(model)
        trainer.fit(model)

        # train for X more epochs on the next stage, ``metadata_only``
        trainer.max_epochs += X
        model.advance_stage()
        trainer.fit(model)

        # train for Y more epochs on the next stage, ``all``
        trainer.max_epochs += Y
        model.advance_stage()
        trainer.fit(model)

        model.eval()

        # do evaluation as normal with ``model``

        model.save_model(path='model')
        new_model = HybridModel(load_model_path='model')

        # do evaluation as normal with ``new_model``

    Note
    ----
    The ``forward`` calculation will be different depending on the stage that is set. Note this
    when evaluating / saving and loading models in.

    Parameters
    ----------
    item_metadata: torch.tensor, pd.DataFrame, or np.array, 2-dimensional
        The shape of the item metadata should be (num_items x metadata_features), and each item's
        metadata should be available when indexing a row by an item ID
    user_metadata: torch.tensor, pd.DataFrame, or np.array, 2-dimensional
        The shape of the user metadata should be (num_users x metadata_features), and each user's
        metadata should be available when indexing a row by a user ID
    embedding_dim: int
        Number of latent factors to use for user and item embeddings
    item_metadata_layers_dims: list
        List of linear layer dimensions to apply to the item metadata only, starting with the
        dimension directly following ``item_metadata_features`` and ending with the dimension to
        concatenate with the item embeddings
    user_metadata_layers_dims: list
        List of linear layer dimensions to apply to the user metadata only, starting with the
        dimension directly following ``user_metadata_features`` and ending with the dimension to
        concatenate with the user embeddings
    combined_layers_dims: list
        List of linear layer dimensions to apply to the concatenated item embeddings and item
        metadata, starting with the dimension directly following the shape of
        ``item_embeddings + metadata_features`` and ending with the dimension before the final
        linear layer to dimension 1
    dropout_p: float
        Probability of dropout
    metadata_only_stage_lr: float
        Learning rate for metadata and combined layers optimized during the ``metadata_only`` stage
    all_stage_lr: float
        Learning rate for all model parameters optimized during the ``all`` stage
    optimizer: torch.optim or str
        Optimizer used for embeddings and bias terms (if ``bias_optimizer`` is ``None``) during the
        ``matrix_factorization`` stage. If a string, one of the following supported optimizers:

        * ``'sgd'`` (for ``torch.optim.SGD``)

        * ``'adam'`` (for ``torch.optim.Adam``)

    metadata_only_stage_optimizer: torch.optim or str
        Optimizer used for metadata and combined layers during the ``metadata_only`` stage. If a
        string, one of the following supported optimizers:

        * ``'sgd'`` (for ``torch.optim.SGD``)

        * ``'adam'`` (for ``torch.optim.Adam``)

    all_stage_optimizer: torch.optim or str
        Optimizer used for all model parameters during the ``all`` stage. If a string, one of the
        following supported optimizers:

        * ``'sgd'`` (for ``torch.optim.SGD``)

        * ``'adam'`` (for ``torch.optim.Adam``)

    """
    # NOTE: ``__init__`` deliberately carries no docstring of its own since it is handed to
    # ``merge_docstrings`` below. The ``combined_layers_dims`` list default is never mutated in
    # this class (it is only read and concatenated), so it is kept as-is for compatibility.
    def __init__(self,
                 train: INTERACTIONS_LIKE_INPUT = None,
                 val: INTERACTIONS_LIKE_INPUT = None,
                 item_metadata: Union[torch.Tensor, pd.DataFrame, np.ndarray] = None,
                 user_metadata: Union[torch.Tensor, pd.DataFrame, np.ndarray] = None,
                 embedding_dim: int = 30,
                 item_metadata_layers_dims: Optional[List[int]] = None,
                 user_metadata_layers_dims: Optional[List[int]] = None,
                 combined_layers_dims: List[int] = [128, 64, 32],
                 dropout_p: float = 0.0,
                 lr: float = 1e-3,
                 bias_lr: Optional[Union[float, str]] = 1e-2,
                 metadata_only_stage_lr: float = 1e-3,
                 all_stage_lr: float = 1e-4,
                 lr_scheduler_func: Optional[torch.optim.lr_scheduler._LRScheduler] = partial(
                     ReduceLROnPlateau,
                     patience=1,
                     verbose=False,
                 ),
                 weight_decay: float = 0.0,
                 optimizer: Union[str, torch.optim.Optimizer] = 'adam',
                 bias_optimizer: Optional[Union[str, torch.optim.Optimizer]] = 'sgd',
                 metadata_only_stage_optimizer: Union[str, torch.optim.Optimizer] = 'adam',
                 all_stage_optimizer: Union[str, torch.optim.Optimizer] = 'adam',
                 loss: Union[str, Callable[..., torch.Tensor]] = 'hinge',
                 metadata_for_loss: Optional[Dict[str, torch.Tensor]] = None,
                 metadata_for_loss_weights: Optional[Dict[str, float]] = None,
                 load_model_path: Optional[str] = None,
                 map_location: Optional[str] = None):
        item_metadata_num_cols = None
        user_metadata_num_cols = None
        optimizer_config_list = None

        # when loading a saved model, metadata and optimizer configuration are not built here
        if load_model_path is None:
            if item_metadata is None and user_metadata is None:
                raise ValueError(
                    'Must provide item metadata and/or user metadata for ``HybridModel``.'
                )

            # the argument names are rebound on purpose: ``get_init_arguments()`` below is
            # called after the conversion has happened
            if item_metadata is not None:
                item_metadata = self._metadata_to_float_tensor(item_metadata)
                item_metadata_num_cols = item_metadata.shape[1]

            if user_metadata is not None:
                user_metadata = self._metadata_to_float_tensor(user_metadata)
                user_metadata_num_cols = user_metadata.shape[1]

            if bias_optimizer is not None:
                initial_optimizer_block = [
                    {
                        'lr': lr,
                        'optimizer': optimizer,
                        # optimize embeddings...
                        'parameter_prefix_list': ['user_embedding', 'item_embedding'],
                        'stage': 'matrix_factorization',
                    },
                    {
                        'lr': lr if bias_lr == 'infer' else bias_lr,
                        'optimizer': optimizer if bias_optimizer == 'infer' else bias_optimizer,
                        # ... and optimize bias terms too
                        'parameter_prefix_list': ['user_bias', 'item_bias'],
                        'stage': 'matrix_factorization',
                    },
                ]
            else:
                initial_optimizer_block = [
                    {
                        'lr': lr,
                        'optimizer': optimizer,
                        # optimize embeddings and bias terms all together
                        'parameter_prefix_list': [
                            'user_embedding',
                            'item_embedding',
                            'user_bias',
                            'item_bias',
                        ],
                        'stage': 'matrix_factorization',
                    },
                ]

            optimizer_config_list = initial_optimizer_block + [
                {
                    'lr': metadata_only_stage_lr,
                    'optimizer': metadata_only_stage_optimizer,
                    # optimize metadata layers only
                    'parameter_prefix_list': [
                        'item_metadata',
                        'user_metadata',
                        'combined',
                        'user_bias',
                        'item_bias',
                    ],
                    'stage': 'metadata_only',
                },
                {
                    'lr': all_stage_lr,
                    'optimizer': all_stage_optimizer,
                    # optimize everything
                    'parameter_prefix_list': [
                        'user',
                        'item',
                        'combined',
                    ],
                    'stage': 'all',
                },
            ]

        super().__init__(optimizer_config_list=optimizer_config_list,
                         item_metadata_num_cols=item_metadata_num_cols,
                         user_metadata_num_cols=user_metadata_num_cols,
                         **get_init_arguments())

    __doc__ = merge_docstrings(MultiStagePipeline, __doc__, __init__)

    @staticmethod
    def _metadata_to_float_tensor(
        metadata: Union[torch.Tensor, pd.DataFrame, np.ndarray],
    ) -> torch.Tensor:
        """
        Convert user or item metadata to a float tensor.

        Parameters
        ----------
        metadata: torch.tensor, pd.DataFrame, or np.array
            ``DataFrame`` and ``ndarray`` inputs are converted with ``torch.from_numpy``; any
            other input is used as-is and must support ``.float()``

        Returns
        -------
        metadata: torch.tensor
            The result of calling ``.float()`` on the (converted) input

        """
        if isinstance(metadata, pd.DataFrame):
            metadata = torch.from_numpy(metadata.to_numpy())
        elif isinstance(metadata, np.ndarray):
            metadata = torch.from_numpy(metadata)

        return metadata.float()

    def _move_any_external_data_to_device(self):
        """Move item and user metadata to the device before training."""
        super()._move_any_external_data_to_device()

        if self.item_metadata is not None:
            self.item_metadata = self.item_metadata.to(self.device)

        if self.user_metadata is not None:
            self.user_metadata = self.user_metadata.to(self.device)

    def _load_metadata_file(self, load_model_path: str, metadata_type: str) -> None:
        """
        Load ``{metadata_type}_metadata.pkl`` from ``load_model_path`` onto the model, if present.

        Parameters
        ----------
        load_model_path: str
            Directory containing the saved model files
        metadata_type: str
            Metadata type, one of ``user`` or ``item``. It selects the file read and the
            ``{metadata_type}_metadata`` attribute set

        """
        attribute_name = f'{metadata_type}_metadata'
        filename = f'{attribute_name}.pkl'

        try:
            # NOTE(review): ``joblib.load`` unpickles the file, so only model directories from
            # trusted sources should be loaded
            setattr(self, attribute_name, joblib.load(os.path.join(load_model_path, filename)))
        except FileNotFoundError:
            # keep the attribute defined so the ``is None`` checks in ``forward`` and
            # ``save_model`` do not fail with an ``AttributeError``
            if not hasattr(self, attribute_name):
                setattr(self, attribute_name, None)

            if getattr(self.hparams, f'{metadata_type}_metadata_layers_dims') is not None:
                warnings.warn(f'``{filename}`` not found')

    def _load_model_init_helper(self, load_model_path: str, map_location: str, **kwargs) -> None:
        """
        Load the model weights from ``model.pth`` and any saved metadata in ``load_model_path``.

        A missing metadata file is not an error: the matching attribute is left as ``None`` and a
        warning is emitted only if metadata layers were configured for that metadata type.

        Parameters
        ----------
        load_model_path: str
            Directory containing ``model.pth`` and, optionally, ``item_metadata.pkl`` and
            ``user_metadata.pkl``
        map_location: str
            Passed through to the parent class's ``_load_model_init_helper``
        **kwargs: keyword arguments
            Passed through to the parent class's ``_load_model_init_helper``

        """
        super()._load_model_init_helper(load_model_path=os.path.join(load_model_path, 'model.pth'),
                                        map_location=map_location,
                                        **kwargs)

        self._load_metadata_file(load_model_path=load_model_path, metadata_type='item')
        self._load_metadata_file(load_model_path=load_model_path, metadata_type='user')

    def _configure_metadata_layers(
        self,
        metadata_type: str,
        metadata_layers_dims: Optional[Iterable[int]],
        num_metadata_cols: Optional[int],
    ) -> None:
        """
        Configure metadata layers for either item or user data.

        Parameters
        ----------
        metadata_type: str
            Metadata type, one of ``user`` or ``item``. It is used to set the attributes
            ``{metadata_type}_metadata_layers`` and ``{metadata_type}_metadata_layers_dims``
        metadata_layers_dims: list
            List of dimensions for the hidden state of the metadata layers. Nothing is configured
            if this is ``None``
        num_metadata_cols: int
            Number of columns in the metadata dataset

        """
        if metadata_layers_dims is None:
            return

        # ``list(...)`` so that any iterable of dimensions (e.g. a tuple) can be concatenated
        full_metadata_layers_dims = [num_metadata_cols] + list(metadata_layers_dims)

        full_metadata_layers = [
            nn.Linear(input_dim, output_dim)
            for input_dim, output_dim in zip(full_metadata_layers_dims[:-1],
                                             full_metadata_layers_dims[1:])
        ]
        setattr(self, f'{metadata_type}_metadata_layers', full_metadata_layers)

        # the layers live in a plain list, so each one is also registered as a submodule
        for i, layer in enumerate(full_metadata_layers):
            nn.init.xavier_normal_(layer.weight)
            self.add_module(f'{metadata_type}_metadata_layer_{i}', layer)

    def _setup_model(self, **kwargs) -> None:
        """
        Method for building model internals that rely on the data passed in.

        This method will be called after `prepare_data`.

        Raises
        ------
        ValueError
            If neither an item nor a user metadata dimension is available to size the combined
            layers

        """
        if self.hparams.load_model_path is None:
            if 'item_metadata' in kwargs:
                self.item_metadata = kwargs.pop('item_metadata')
            if 'user_metadata' in kwargs:
                self.user_metadata = kwargs.pop('user_metadata')

        self.user_biases = ZeroEmbedding(num_embeddings=self.hparams.num_users, embedding_dim=1)
        self.item_biases = ZeroEmbedding(num_embeddings=self.hparams.num_items, embedding_dim=1)
        self.user_embeddings = ScaledEmbedding(num_embeddings=self.hparams.num_users,
                                               embedding_dim=self.hparams.embedding_dim)
        self.item_embeddings = ScaledEmbedding(num_embeddings=self.hparams.num_items,
                                               embedding_dim=self.hparams.embedding_dim)
        self.dropout = nn.Dropout(p=self.hparams.dropout_p)

        # set up item metadata-only layers
        item_metadata_output_dim = self.hparams.item_metadata_num_cols
        self.item_metadata_layers = None
        if self.hparams.item_metadata_layers_dims is not None:
            self._configure_metadata_layers(
                metadata_type='item',
                metadata_layers_dims=self.hparams.item_metadata_layers_dims,
                num_metadata_cols=self.hparams.item_metadata_num_cols,
            )
            item_metadata_output_dim = self.hparams.item_metadata_layers_dims[-1]

        # set up user metadata-only layers
        user_metadata_output_dim = self.hparams.user_metadata_num_cols
        self.user_metadata_layers = None
        if self.hparams.user_metadata_layers_dims is not None:
            self._configure_metadata_layers(
                metadata_type='user',
                metadata_layers_dims=self.hparams.user_metadata_layers_dims,
                num_metadata_cols=self.hparams.user_metadata_num_cols,
            )
            user_metadata_output_dim = self.hparams.user_metadata_layers_dims[-1]

        # set up combined layers depending on metadata inputs
        metadata_output_dims = [
            dim for dim in (user_metadata_output_dim, item_metadata_output_dim)
            if dim is not None
        ]
        if not metadata_output_dims:
            raise ValueError(
                'Must provide item metadata and/or user metadata for ``HybridModel``.'
            )

        combined_dimension_input = (
            self.user_embeddings.embedding_dim
            + self.item_embeddings.embedding_dim
            + sum(metadata_output_dims)
        )

        # the trailing ``1`` is the final linear layer producing a single score
        combined_layers_dims = (
            [combined_dimension_input] + list(self.hparams.combined_layers_dims) + [1]
        )
        self.combined_layers = [
            nn.Linear(input_dim, output_dim)
            for input_dim, output_dim in zip(combined_layers_dims[:-1], combined_layers_dims[1:])
        ]

        # the layers live in a plain list, so each one is also registered as a submodule
        for i, layer in enumerate(self.combined_layers):
            nn.init.xavier_normal_(layer.weight)
            self.add_module(f'combined_layer_{i}', layer)

    def _compute_metadata_output(
        self,
        metadata_type: str,
        ids: torch.Tensor,
    ) -> torch.Tensor:
        """
        Calculate metadata output for either item or user data.

        The metadata rows for ``ids`` are looked up and, if metadata layers are configured for
        this metadata type, passed through each layer followed by a leaky ReLU and dropout.

        Parameters
        ----------
        metadata_type: str
            Metadata type, one of ``user`` or ``item``
        ids: tensor, 1-d
            Array of user indices or item indices

        Returns
        -------
        metadata_output: tensor
            Metadata output, one row per entry in ``ids``

        """
        # TODO: remove self.device and let lightning do it
        metadata = getattr(self, f'{metadata_type}_metadata')
        metadata_layers = getattr(self, f'{metadata_type}_metadata_layers')

        metadata_output = metadata[ids, :].to(self.device)

        if metadata_layers is not None:
            for metadata_nn_layer in metadata_layers:
                metadata_output = self.dropout(
                    F.leaky_relu(
                        metadata_nn_layer(metadata_output)
                    )
                )

        return metadata_output

    def _compute_prediction(
        self,
        combined_output: torch.Tensor,
        users: torch.Tensor,
        items: torch.Tensor,
    ) -> torch.Tensor:
        """
        Calculate prediction output.

        Every combined layer except the last is followed by a leaky ReLU and dropout. The last
        layer's output has the user and item bias terms added to it.

        Parameters
        ----------
        combined_output: tensor, 2-d
            Array of user and item embeddings concatenated with item and/or user metadata
        users: tensor, 1-d
            Array of user indices
        items: tensor, 1-d
            Array of item indices

        Returns
        -------
        pred_scores: tensor, 2-d
            Predicted scores

        """
        for combined_nn_layer in self.combined_layers[:-1]:
            combined_output = self.dropout(
                F.leaky_relu(
                    combined_nn_layer(combined_output)
                )
            )

        pred_scores = (
            self.combined_layers[-1](combined_output)
            + self.user_biases(users)
            + self.item_biases(items)
        )

        return pred_scores

    def forward(self, users: torch.Tensor, items: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the model.

        In the ``matrix_factorization`` stage, the score is the dot product of the user and item
        embeddings plus the bias terms. In any other stage, the embeddings are concatenated with
        the available metadata output(s) and sent through the combined layers.

        Parameters
        ----------
        users: tensor, 1-d
            Array of user indices
        items: tensor, 1-d
            Array of item indices

        Returns
        -------
        preds: tensor, 1-d
            Predicted ratings or rankings

        """
        if self.hparams.stage == 'matrix_factorization':
            pred_scores = (
                torch.mul(
                    self.dropout(self.user_embeddings(users)),
                    self.dropout(self.item_embeddings(items))
                ).sum(axis=1)
                + self.user_biases(users).squeeze(1)
                + self.item_biases(items).squeeze(1)
            )

            return pred_scores.squeeze()

        in_metadata_stage = self.hparams.stage in ('metadata_only', 'all')

        # NOTE: the concatenation order below must stay (user metadata, user embeddings, item
        # embeddings, item metadata), skipping whichever metadata is missing - the first
        # combined layer is sized for exactly these pieces in ``_setup_model``
        # TODO: make this matrix factorization instead of only a MLP
        if in_metadata_stage and self.user_metadata is None:
            item_metadata_output = self._compute_metadata_output(
                metadata_type='item', ids=items
            )
            pieces = (
                self.user_embeddings(users),
                self.item_embeddings(items),
                item_metadata_output,
            )
        elif in_metadata_stage and self.item_metadata is None:
            user_metadata_output = self._compute_metadata_output(
                metadata_type='user', ids=users
            )
            pieces = (
                user_metadata_output,
                self.user_embeddings(users),
                self.item_embeddings(items),
            )
        else:
            user_metadata_output = self._compute_metadata_output(
                metadata_type='user', ids=users
            )
            item_metadata_output = self._compute_metadata_output(
                metadata_type='item', ids=items
            )
            pieces = (
                user_metadata_output,
                self.user_embeddings(users),
                self.item_embeddings(items),
                item_metadata_output,
            )

        combined_output = torch.cat(pieces, 1)
        pred_scores = self._compute_prediction(combined_output, users, items)

        return pred_scores.squeeze()

    def _get_item_embeddings(self) -> torch.Tensor:
        """Get item embeddings on device."""
        # TODO: update this to get the embeddings post-MLP
        # NOTE(review): the returned expression was missing from the recovered source; this
        # assumes ``ScaledEmbedding`` exposes ``weight`` like ``nn.Embedding`` - confirm
        return self.item_embeddings.weight.data

    def _get_user_embeddings(self) -> torch.Tensor:
        """Get user embeddings on device."""
        # TODO: update this to get the embeddings post-MLP
        # NOTE(review): the returned expression was missing from the recovered source; this
        # assumes ``ScaledEmbedding`` exposes ``weight`` like ``nn.Embedding`` - confirm
        return self.user_embeddings.weight.data

    def save_model(self,
                   path: Union[str, Path] = os.path.join(DATA_PATH / 'model'),
                   overwrite: bool = False) -> None:
        """
        Save the model's state dictionary, hyperparameters, and user and/or item metadata.

        While PyTorch Lightning offers a way to save and load models, there are two main reasons
        for overriding these:

        1) To properly save and load a model requires the ``Trainer`` object, meaning that all
           deployed models will require Lightning to run the model, which is not actually needed
           for inference.

        2) In the v0.8.4 release, loading a model back in leads to a ``RuntimeError`` unable to load
           in weights.

        Parameters
        ----------
        path: str or Path
            Directory path to save model and data files
        overwrite: bool
            Whether or not to overwrite existing data

        Raises
        ------
        ValueError
            If ``path`` already exists, is not empty, and ``overwrite`` is ``False``

        """
        path = str(path)

        if os.path.exists(path) and os.listdir(path) and overwrite is False:
            raise ValueError(f'Data exists in ``path`` at {path} and ``overwrite`` is False.')

        Path(path).mkdir(parents=True, exist_ok=True)

        # metadata is stored next to the weights so that ``load_model_path`` can restore it
        if self.item_metadata is not None:
            joblib.dump(self.item_metadata, os.path.join(path, 'item_metadata.pkl'))
        if self.user_metadata is not None:
            joblib.dump(self.user_metadata, os.path.join(path, 'user_metadata.pkl'))

        super().save_model(filename=os.path.join(path, 'model.pth'))