Source code for collie.interactions.dataloaders

import textwrap
from typing import Iterable, Optional, Union

import numpy as np
from scipy.sparse import coo_matrix
import torch

from collie.interactions.datasets import (BaseInteractions,
                                          ExplicitInteractions,
                                          HDF5Interactions,
                                          Interactions)
from collie.interactions.samplers import ApproximateNegativeSampler, HDF5Sampler


class BaseInteractionsDataLoader(torch.utils.data.DataLoader):
    """
    A base class acting as a wrapper around a ``torch.utils.data.DataLoader`` for
    ``Interactions``-type datasets. This class should only be inherited from and not used for model
    training.

    Parameters
    ----------
    interactions: Interactions or HDF5Interactions
        Dataset passed to ``torch.utils.data.DataLoader`` as its ``dataset`` argument and stored
        on the loader as the ``interactions`` attribute
    **kwargs: keyword arguments
        Keyword arguments passed into ``torch.utils.data.DataLoader.__init__``:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

    Attributes
    ----------
    interactions: Interactions or HDF5Interactions
        The dataset this loader was constructed with

    Original ``torch.utils.data.DataLoader`` docstring as follows:
    ########
    """
    def __init__(self,
                 interactions: Optional[Union[Interactions, HDF5Interactions]] = None,
                 **kwargs):
        super().__init__(
            dataset=interactions,
            **kwargs,
        )

        # keep a handle on the dataset so the convenience properties below can delegate to it
        self.interactions = interactions

    # Docstrings are stripped when Python runs with ``-OO``, in which case ``__doc__`` resolves to
    # ``None`` and an unguarded ``+=`` would raise ``TypeError`` at import time.
    if __doc__ is not None and torch.utils.data.DataLoader.__doc__ is not None:
        __doc__ += torch.utils.data.DataLoader.__doc__

    @property
    def num_users(self) -> int:
        """Number of users in ``interactions``."""
        return self.interactions.num_users

    @property
    def num_items(self) -> int:
        """Number of items in ``interactions``."""
        return self.interactions.num_items

    @property
    def num_negative_samples(self) -> int:
        """Number of negative samples in ``interactions``."""
        return self.interactions.num_negative_samples

    @property
    def num_interactions(self) -> int:
        """Number of interactions in ``interactions``."""
        return self.interactions.num_interactions

    @property
    def mat(self) -> coo_matrix:
        """Sparse COO matrix of ``interactions``."""
        return self.interactions.mat


class InteractionsDataLoader(BaseInteractionsDataLoader):
    """
    A light wrapper around a ``torch.utils.data.DataLoader`` for ``Interactions``-type datasets.

    For implicit data, batches will be created one-point-at-a-time using exact negative sampling
    (unless configured not to in ``interactions``), which is optimal when datasets are smaller
    (< 1M+ interactions) and model training speed is not a concern. This is the default
    ``DataLoader`` for ``Interactions`` datasets.

    For explicit data, negative sampling is not used, but batches will still be created
    one-point-at-a-time.

    Parameters
    ----------
    interactions: BaseInteractions
        If not provided, an ``Interactions`` object will be created with ``mat`` or all of
        ``users``, ``items``, and ``ratings``
    mat: scipy.sparse.coo_matrix or numpy.array, 2-dimensional
        If ``interactions is None``, will be used instead of ``users``, ``items``, and ``ratings``
        arguments to create an ``Interactions`` object
    users: Iterable[int], 1-d
        If ``interactions is None and mat is None``, array of user IDs, starting at 0
    items: Iterable[int], 1-d
        If ``interactions is None and mat is None``, array of corresponding item IDs to ``users``,
        starting at 0
    ratings: Iterable[int], 1-d
        If ``interactions is None and mat is None``, array of corresponding ratings to both
        ``users`` and ``items``. If ``None``, will default to each user in ``users`` interacting
        with an item with a rating value of 1
    batch_size: int
        Number of samples per batch to load
    shuffle: bool
        Whether to shuffle the order of data returned or not. This is especially useful for
        training data to ensure the model does not overfit to a specific order of data
    **kwargs: keyword arguments
        Relevant keyword arguments will be passed into ``Interactions`` object creation, if
        ``interactions is None`` and the keyword argument matches one of
        ``Interactions.__init__.__code__.co_varnames``. All other keyword arguments will be passed
        into ``torch.utils.data.DataLoader``:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

    Attributes
    ----------
    interactions: Interactions (default) or ExplicitInteractions
    shuffle: bool
        The ``shuffle`` value this loader was constructed with

    Original ``torch.utils.data.DataLoader`` docstring as follows:
    ########
    """
    def __init__(self,
                 interactions: Optional[BaseInteractions] = None,
                 mat: Optional[Union[coo_matrix, np.ndarray]] = None,
                 users: Optional[Iterable[int]] = None,
                 items: Optional[Iterable[int]] = None,
                 ratings: Optional[Iterable[int]] = None,
                 batch_size: int = 1024,
                 shuffle: bool = False,
                 **kwargs):
        if interactions is None:
            interactions_varnames = Interactions.__init__.__code__.co_varnames
            dataloader_varnames = torch.utils.data.DataLoader.__init__.__code__.co_varnames

            # find all kwargs in the ``__init__`` for a ``Interactions`` object
            interactions_only_kwargs = {
                k: v for k, v in kwargs.items() if k in interactions_varnames
            }

            # find all kwargs not in the ``__init__`` for a ``Interactions`` object OR all kwargs
            # that are used in the ``__init__`` for a ``torch.utils.data.DataLoader`` object (a
            # name known to both constructors is therefore forwarded to both)
            kwargs = {
                k: v for k, v in kwargs.items()
                if k not in interactions_varnames or k in dataloader_varnames
            }

            interactions = Interactions(mat=mat,
                                        users=users,
                                        items=items,
                                        ratings=ratings,
                                        **interactions_only_kwargs)

        super().__init__(
            interactions=interactions,
            batch_size=batch_size,
            shuffle=shuffle,
            **kwargs,
        )

        # remembered only so ``__repr__`` can report it
        self.shuffle = shuffle

    # Docstrings are stripped when Python runs with ``-OO``, in which case ``__doc__`` resolves to
    # ``None`` and an unguarded ``+=`` would raise ``TypeError`` at import time.
    if __doc__ is not None and torch.utils.data.DataLoader.__doc__ is not None:
        __doc__ += torch.utils.data.DataLoader.__doc__

    def __repr__(self) -> str:
        """String representation of ``InteractionsDataLoader`` class."""
        # datasets without a ``num_negative_samples`` attribute are reported as explicit
        if hasattr(self.interactions, 'num_negative_samples'):
            extra_repr_str = (
                f'{self.num_negative_samples} negative samples per implicit interaction in'
            )
        else:
            extra_repr_str = 'explicit,'

        return textwrap.dedent(
            f'''
            InteractionsDataLoader object with {self.num_interactions} interactions between
            {self.num_users} users and {self.num_items} items, returning {extra_repr_str}
            {'shuffled' if self.shuffle else 'non-shuffled'} batches of size {self.batch_size}.
            '''
        ).replace('\n', ' ').strip()
class ApproximateNegativeSamplingInteractionsDataLoader(BaseInteractionsDataLoader):
    """
    A computationally more efficient ``DataLoader`` for ``Interactions`` data using approximate
    negative sampling for negative items.

    This DataLoader groups ``__getitem__`` calls together into a single operation, which
    dramatically speeds up a traditional DataLoader's process of calling ``__getitem__`` one index
    at a time, then concatenating them together before returning. In an effort to batch operations
    together, all negative samples returned will be approximate, meaning this does not check if a
    user has previously interacted with the item. With a sufficient number of interactions (1M+),
    we have found a speed increase of 2x at the cost of a 1% reduction in MAP @ 10 performance
    compared to ``InteractionsDataLoader``.

    For greater efficiency, we disable automated batching by setting the DataLoader's
    ``batch_size`` attribute to ``None``. Thus, to access the "true" batch size that the sampler
    uses, access
    ``ApproximateNegativeSamplingInteractionsDataLoader.approximate_negative_sampler.batch_size``.

    Parameters
    ----------
    interactions: Interactions
        If not provided, an ``Interactions`` object will be created with ``mat`` or all of
        ``users``, ``items``, and ``ratings`` with ``max_number_of_samples_to_consider=0``. If
        provided, its ``max_number_of_samples_to_consider`` attribute is set to 0 in place
    mat: scipy.sparse.coo_matrix or numpy.array, 2-dimensional
        If ``interactions is None``, will be used instead of ``users``, ``items``, and ``ratings``
        arguments to create an ``Interactions`` object
    users: Iterable[int], 1-d
        If ``interactions is None and mat is None``, array of user IDs, starting at 0
    items: Iterable[int], 1-d
        If ``interactions is None and mat is None``, array of corresponding item IDs to ``users``,
        starting at 0
    ratings: Iterable[int], 1-d
        If ``interactions is None and mat is None``, array of corresponding ratings to both
        ``users`` and ``items``. If ``None``, will default to each user in ``users`` interacting
        with an item with a rating value of 1
    batch_size: int
        Number of samples per batch to load
    shuffle: bool
        Whether to shuffle the order of data returned or not. This is especially useful for
        training data to ensure the model does not overfit to a specific order of data
    **kwargs: keyword arguments
        Relevant keyword arguments will be passed into ``Interactions`` object creation, if
        ``interactions is None`` and the keyword argument matches one of
        ``Interactions.__init__.__code__.co_varnames``. A caller-supplied
        ``max_number_of_samples_to_consider`` is overridden with 0. All other keyword arguments
        will be passed into ``torch.utils.data.DataLoader``:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

    Attributes
    ----------
    interactions: Interactions
    approximate_negative_sampler: ApproximateNegativeSampler
        Sampler handed to the underlying ``DataLoader``; holds the effective batch size
    shuffle: bool
        The ``shuffle`` value this loader was constructed with

    Raises
    ------
    ValueError
        If ``interactions`` is an ``ExplicitInteractions`` instance

    Original ``torch.utils.data.DataLoader`` docstring as follows:
    ########
    """
    def __init__(self,
                 interactions: Optional[Interactions] = None,
                 mat: Optional[Union[coo_matrix, np.ndarray]] = None,
                 users: Optional[Iterable[int]] = None,
                 items: Optional[Iterable[int]] = None,
                 ratings: Optional[Iterable[int]] = None,
                 batch_size: int = 1024,
                 shuffle: bool = False,
                 **kwargs):
        if isinstance(interactions, ExplicitInteractions):
            raise ValueError(
                '``ApproximateNegativeSamplingInteractionsDataLoader`` does not support explicit'
                ' data types!'
            )

        if interactions is None:
            interactions_varnames = Interactions.__init__.__code__.co_varnames
            dataloader_varnames = torch.utils.data.DataLoader.__init__.__code__.co_varnames

            # find all kwargs in the ``__init__`` for a ``Interactions`` object
            interactions_only_kwargs = {
                k: v for k, v in kwargs.items() if k in interactions_varnames
            }

            # find all kwargs not in the ``__init__`` for a ``Interactions`` object OR all kwargs
            # that are used in the ``__init__`` for a ``torch.utils.data.DataLoader`` object
            kwargs = {
                k: v for k, v in kwargs.items()
                if k not in interactions_varnames or k in dataloader_varnames
            }

            # we need ``max_number_of_samples_to_consider`` to be 0 in order to do approximate
            # negative sampling. Setting it inside the dict (rather than as a second explicit
            # keyword) avoids a ``TypeError`` for a duplicated keyword argument when the caller
            # also supplied a value.
            interactions_only_kwargs['max_number_of_samples_to_consider'] = 0

            interactions = Interactions(mat=mat,
                                        users=users,
                                        items=items,
                                        ratings=ratings,
                                        **interactions_only_kwargs)
        else:
            # we need ``max_number_of_samples_to_consider`` to be 0 in order to do approximate
            # negative sampling; note this mutates the caller's ``interactions`` object
            interactions.max_number_of_samples_to_consider = 0

        approximate_negative_sampler = ApproximateNegativeSampler(interactions=interactions,
                                                                  batch_size=batch_size,
                                                                  shuffle=shuffle,
                                                                  seed=interactions.seed)

        super().__init__(
            interactions=interactions,
            sampler=approximate_negative_sampler,
            batch_size=None,  # Disable automated batching
            **kwargs,
        )

        self.approximate_negative_sampler = approximate_negative_sampler
        self.shuffle = shuffle

    # Docstrings are stripped when Python runs with ``-OO``, in which case ``__doc__`` resolves to
    # ``None`` and an unguarded ``+=`` would raise ``TypeError`` at import time.
    if __doc__ is not None and torch.utils.data.DataLoader.__doc__ is not None:
        __doc__ += torch.utils.data.DataLoader.__doc__

    def __repr__(self) -> str:
        """String representation of ``ApproximateNegativeSamplingInteractionsDataLoader`` class."""
        return textwrap.dedent(
            f'''
            ApproximateNegativeSamplingInteractionsDataLoader object with {self.num_interactions}
            interactions between {self.num_users} users and {self.num_items} items, returning
            {self.num_negative_samples} negative samples per implicit interaction in
            {'shuffled' if self.shuffle else 'non-shuffled'} batches of size
            {self.approximate_negative_sampler.batch_size}.
            '''
        ).replace('\n', ' ').strip()
class HDF5InteractionsDataLoader(BaseInteractionsDataLoader):
    """
    A light wrapper around a ``torch.utils.data.DataLoader`` for HDF5 data, with behavior very
    similar to ``ApproximateNegativeSamplingInteractionsDataLoader``.

    If not provided, a ``HDF5Interactions`` dataset will be created as the data for the
    ``DataLoader``. A custom sampler, ``HDF5Sampler``, will also be instantiated for the
    ``DataLoader`` to use that allows sampling in batches that make for faster HDF5 data reads
    from disk.

    While similar to a standard ``DataLoader``, note that when ``shuffle is True``, this will only
    shuffle the order of batches and the data within batches to still make for efficient reading
    of HDF5 data from disk, rather than shuffling across the entire dataset.

    For greater efficiency, we disable automated batching by setting the DataLoader's
    ``batch_size`` attribute to ``None``. Thus, to access the "true" batch size that the sampler
    uses, access ``HDF5InteractionsDataLoader.hdf5_sampler.batch_size``.

    Parameters
    ----------
    hdf5_interactions: HDF5Interactions
        If provided, will override input argument for ``hdf5_path``
    hdf5_path: str
        If ``hdf5_interactions is None``, the path to the HDF5 dataset
    batch_size: int
        Number of samples per batch to load
    shuffle: bool
        Whether to shuffle the order of batches returned or not. This is especially useful for
        training data to ensure the model does not overfit to a specific order of data. Note that
        this will not perform a true shuffle of the data, but shuffle the order of batches. While
        this is an approximation of true sampling, it allows us a greater speed up during model
        training for a negligible effect on model performance
    **kwargs: keyword arguments
        Relevant keyword arguments will be passed into ``HDF5Interactions`` object creation, if
        ``hdf5_interactions is None`` and the keyword argument matches one of
        ``HDF5Interactions.__init__.__code__.co_varnames``. All other keyword arguments will be
        passed into ``torch.utils.data.DataLoader``:
        https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader

    Attributes
    ----------
    interactions: HDF5Interactions
    hdf5_sampler: HDF5Sampler
        Sampler handed to the underlying ``DataLoader``; holds the effective batch size
    hdf5_path: str or None
        The ``hdf5_path`` argument exactly as passed in (not read back from
        ``hdf5_interactions``)
    shuffle: bool
        The ``shuffle`` value this loader was constructed with

    Original ``torch.utils.data.DataLoader`` docstring as follows:
    ########
    """
    def __init__(self,
                 hdf5_interactions: Optional[HDF5Interactions] = None,
                 hdf5_path: Optional[str] = None,
                 batch_size: int = 1024,
                 shuffle: bool = False,
                 **kwargs):
        if hdf5_interactions is None:
            interactions_varnames = HDF5Interactions.__init__.__code__.co_varnames
            dataloader_varnames = torch.utils.data.DataLoader.__init__.__code__.co_varnames

            # find all kwargs in the ``__init__`` for a ``HDF5Interactions`` object
            interactions_only_kwargs = {
                k: v for k, v in kwargs.items() if k in interactions_varnames
            }

            # find all kwargs not in the ``__init__`` for a ``HDF5Interactions`` object OR all
            # kwargs that are used in the ``__init__`` for a ``torch.utils.data.DataLoader`` object
            kwargs = {
                k: v for k, v in kwargs.items()
                if k not in interactions_varnames or k in dataloader_varnames
            }

            hdf5_interactions = HDF5Interactions(hdf5_path=hdf5_path,
                                                 **interactions_only_kwargs)

        hdf5_sampler = HDF5Sampler(hdf5_interactions=hdf5_interactions,
                                   batch_size=batch_size,
                                   shuffle=shuffle,
                                   seed=hdf5_interactions.seed)

        super().__init__(
            interactions=hdf5_interactions,
            sampler=hdf5_sampler,
            batch_size=None,  # Disable automated batching
            **kwargs,
        )

        self.hdf5_sampler = hdf5_sampler
        self.hdf5_path = hdf5_path
        self.shuffle = shuffle

    # Docstrings are stripped when Python runs with ``-OO``, in which case ``__doc__`` resolves to
    # ``None`` and an unguarded ``+=`` would raise ``TypeError`` at import time.
    if __doc__ is not None and torch.utils.data.DataLoader.__doc__ is not None:
        __doc__ += torch.utils.data.DataLoader.__doc__

    @property
    def mat(self) -> None:
        """``mat`` attribute is not possible to access in ``HDF5InteractionsDataLoader``."""
        raise AttributeError('``HDF5InteractionsDataLoader`` cannot support ``mat`` attribute since'
                             ' data is read in from disk dynamically.')

    def __repr__(self) -> str:
        """String representation of ``HDF5InteractionsDataLoader`` class."""
        return textwrap.dedent(
            f'''
            HDF5InteractionsDataLoader object with {self.interactions.num_interactions}
            interactions between {self.interactions.num_users} users and
            {self.interactions.num_items} items, returning {self.num_negative_samples} negative
            samples per implicit interaction in {'shuffled' if self.shuffle else 'non-shuffled'}
            batches of size {self.hdf5_sampler.batch_size}.
            '''
        ).replace('\n', ' ').strip()