from abc import ABCMeta, abstractmethod
import collections
import random
import textwrap
from typing import Any, Iterable, List, Optional, Tuple, Union
import warnings
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, dok_matrix
import torch
from tqdm.auto import tqdm
import collie
class BaseInteractions(torch.utils.data.Dataset, metaclass=ABCMeta):
    """
    Abstract PyTorch ``Dataset`` holding user-item interactions data as a sparse matrix.

    The data lives in ``self.mat``, a sparse matrix in COOrdinate format laid out as:

    * Users comprising the rows

    * Items comprising the columns

    * Ratings given by that user for that item comprising the elements of the matrix

    The matrix is taken from ``mat`` when it is provided. Otherwise it is assembled from the
    parallel 1-d arrays ``users``, ``items``, and ``ratings``. Subclasses implement
    ``__getitem__`` to define what a single sample looks like.

    Parameters
    ----------
    mat: scipy.sparse.coo_matrix or numpy.array, 2-dimensional
        Interactions matrix, which, if provided, will be used instead of ``users``, ``items``, and
        ``ratings`` arguments
    users: Iterable[int], 1-d
        Array of user IDs, starting at 0
    items: Iterable[int], 1-d
        Array of corresponding item IDs to ``users``, starting at 0
    ratings: Iterable[int], 1-d
        Array of corresponding ratings to both ``users`` and ``items``. If ``None``, ``None`` is
        forwarded to the matrix-building helper, which is expected to give every interaction a
        rating of 1
    allow_missing_ids: bool
        If ``False``, will check that the user IDs contain each integer from 0 to
        ``num_users - 1`` and that the item IDs contain each integer from 0 to ``num_items - 1``.
        The check runs on ``users`` and ``items`` when arrays are given, and on the row and column
        indices of ``mat`` otherwise
    remove_duplicate_user_item_pairs: bool
        Will check for and remove any duplicate user, item ID pairs from the matrix during
        initialization. Note that this will create a second sparse matrix held in memory
        to efficiently check, which could cause memory concerns for larger data. If you are sure
        that there are no duplicated user, item ID pairs, set to ``False``
    num_users: int
        Number of users in the dataset. If ``num_users == 'infer'``, this will be set to the
        ``mat.shape[0]`` or ``max(users) + 1``, depending on the input
    num_items: int
        Number of items in the dataset. If ``num_items == 'infer'``, this will be set to the
        ``mat.shape[1]`` or ``max(items) + 1``, depending on the input

    Raises
    ------
    AssertionError
        If ``mat`` is ``None`` and either ``users`` or ``items`` is ``None``
    ValueError
        If the input arrays differ in length, or if an ID check fails while
        ``allow_missing_ids`` is ``False``

    """
    def __init__(self,
                 mat: Optional[Union[coo_matrix, np.array]] = None,
                 users: Optional[Iterable[int]] = None,
                 items: Optional[Iterable[int]] = None,
                 ratings: Optional[Iterable[int]] = None,
                 allow_missing_ids: bool = False,
                 remove_duplicate_user_item_pairs: bool = True,
                 num_users: int = 'infer',
                 num_items: int = 'infer'):
        ids_must_be_complete = allow_missing_ids is False

        if mat is not None:
            mat = coo_matrix(mat)

            if num_users == 'infer':
                num_users = mat.shape[0]
            if num_items == 'infer':
                num_items = mat.shape[1]

            if ids_must_be_complete:
                # rows are checked before columns, so a row failure is the one reported
                for index_array, expected_count, label in (
                    (mat.row, num_users, 'mat.shape[0]'),
                    (mat.col, num_items, 'mat.shape[1]'),
                ):
                    _check_array_contains_all_integers(array=index_array,
                                                       array_max_value=expected_count,
                                                       array_name=label)
        else:
            assert users is not None and items is not None, (
                'Either 1) ``mat`` or 2) both ``users`` or ``items`` must be non-null!'
            )

            if len(users) != len(items):
                raise ValueError('Lengths of ``users`` and ``items`` must be equal.')

            num_users = collie.utils._infer_num_if_needed_for_1d_array(num_users, users)
            num_items = collie.utils._infer_num_if_needed_for_1d_array(num_items, items)

            if ids_must_be_complete:
                for id_array, expected_count, label in (
                    (users, num_users, 'users'),
                    (items, num_items, 'items'),
                ):
                    _check_array_contains_all_integers(array=id_array,
                                                       array_max_value=expected_count,
                                                       array_name=label)

            if ratings is not None and len(users) != len(ratings):
                raise ValueError(
                    'Length of ``ratings`` must be equal to lengths of ``users``, ``items``.'
                )

            mat = collie.utils._create_sparse_ratings_matrix_helper(users=users,
                                                                    items=items,
                                                                    ratings=ratings,
                                                                    num_users=num_users,
                                                                    num_items=num_items)

        if remove_duplicate_user_item_pairs:
            print('Checking for and removing duplicate user, item ID pairs...')

            # a dictionary-of-keys matrix is keyed on ``(row, col)``, so routing every entry
            # through it leaves one value per user, item ID pair
            # NOTE(review): ``_update`` is a private SciPy method - confirm it exists in the
            # SciPy version in use
            deduplicated = dok_matrix((mat.shape), dtype=mat.dtype)
            deduplicated._update(zip(zip(mat.row, mat.col), mat.data))
            mat = deduplicated.tocoo()

            # drop the temporary matrix right away rather than at the end of ``__init__``
            del deduplicated

        self.mat = mat
        self.allow_missing_ids = allow_missing_ids
        self.remove_duplicate_user_item_pairs = remove_duplicate_user_item_pairs
        self.num_users = num_users
        self.num_items = num_items

        stored_ratings = self.mat.data
        self.num_interactions = self.mat.nnz
        self.min_rating = stored_ratings.min()
        self.max_rating = stored_ratings.max()

    @abstractmethod
    def __getitem__(self, index: Union[int, Iterable[int]]) -> (
        Union[Tuple[Tuple[int, int], np.array], Tuple[Tuple[np.array, np.array], np.array]]
    ):
        """Access item in the ``BaseInteractions`` instance."""
        pass

    def __len__(self) -> int:
        """Number of stored entries (``mat.nnz``) in the ``BaseInteractions`` instance."""
        return self.num_interactions

    def todense(self) -> np.matrix:
        """Return the full interactions matrix as a dense ``np.matrix``, 2-d."""
        return self.mat.todense()

    def toarray(self) -> np.array:
        """Return the full interactions matrix as a dense ``np.array``, 2-d."""
        return self.mat.toarray()

    def head(self, n: int = 5) -> np.array:
        """Return the first ``n`` rows of the dense matrix as a np.array, 2-d."""
        row_count = self._prep_head_tail_n(n=n)
        csr_mat = self.mat.tocsr()

        return csr_mat[range(row_count), :].toarray()

    def tail(self, n: int = 5) -> np.array:
        """Return the last ``n`` rows of the dense matrix as a np.array, 2-d."""
        row_count = self._prep_head_tail_n(n=n)
        csr_mat = self.mat.tocsr()

        # negative row indices count back from the end of the matrix
        return csr_mat[range(-row_count, 0), :].toarray()

    def _prep_head_tail_n(self, n: int) -> int:
        """
        Translate ``n`` into a row count no larger than ``num_users`` for ``head`` and ``tail``.

        A negative ``n`` is read as "all but the last ``abs(n)`` rows", i.e. ``num_users + n``.

        """
        if n < 0:
            n += self.num_users

        return min(n, self.num_users)
class Interactions(BaseInteractions):
    """
    PyTorch ``Dataset`` for implicit user-item interactions data.

    If ``mat`` is provided, the ``Interactions`` instance will act as a wrapper for a sparse matrix
    in COOrdinate format, typically looking like:

    * Users comprising the rows

    * Items comprising the columns

    * Ratings given by that user for that item comprising the elements of the matrix

    ``Interactions`` can be instantiated instead by passing in single arrays with corresponding
    user_ids, item_ids, and ratings (by default, set to 1 for implicit recommenders) values with
    the same functionality as a matrix. Note that with this approach, the number of users and items
    will be the maximum values in those two columns, respectively, and it is expected that all
    integers between 0 and the maximum ID should appear somewhere in the data.

    By default, exact negative sampling will be used during each ``__getitem__`` call. To use
    approximate negative sampling, set ``max_number_of_samples_to_consider = 0``. This will avoid
    building a positive item lookup set during initialization.

    Unlike in ``ExplicitInteractions``, we rely on negative sampling for implicit data. Each
    ``__getitem__`` call will thus return a nested tuple containing user IDs, item IDs, and
    sampled negative item IDs. This nested vs. non-nested structure is key for the model to
    determine where it should be implicit or explicit. Use the table below for reference:

    .. list-table::
        :header-rows: 1

        * - ``__getitem__`` Format
          - Expected Meaning
          - Model Type
        * - ``((X, Y), Z)``
          - ``((user IDs, item IDs), negative item IDs)``
          - **Implicit**
        * - ``(X, Y, Z)``
          - ``(user IDs, item IDs, ratings)``
          - **Explicit**

    Parameters
    ----------
    mat: scipy.sparse.coo_matrix or numpy.array, 2-dimensional
        Interactions matrix, which, if provided, will be used instead of ``users``, ``items``, and
        ``ratings`` arguments
    users: Iterable[int], 1-d
        Array of user IDs, starting at 0
    items: Iterable[int], 1-d
        Array of corresponding item IDs to ``users``, starting at 0
    ratings: Iterable[int], 1-d
        Array of corresponding ratings to both ``users`` and ``items``. If ``None``, will default to
        each user in ``user`` interacting with an item with a rating value of 1. When ``mat`` is
        not provided, rows with a rating of ``0`` are dropped (with a warning) before the matrix
        is built
    num_negative_samples: int
        Number of negative samples to return with each ``__getitem__`` call. Must be at least 1
    allow_missing_ids: bool
        If ``False``, will check that the user and item IDs contain each integer from 0 to the
        number of users and items minus one, respectively
    remove_duplicate_user_item_pairs: bool
        Will check for and remove any duplicate user, item ID pairs from the ``Interactions`` matrix
        during initialization. Note that this will create a second sparse matrix held in memory
        to efficiently check, which could cause memory concerns for larger data. If you are sure
        that there are no duplicated user, item ID pairs, set to ``False``
    num_users: int
        Number of users in the dataset. If ``num_users == 'infer'``, this will be set to the
        ``mat.shape[0]`` or ``max(users) + 1``, depending on the input
    num_items: int
        Number of items in the dataset. If ``num_items == 'infer'``, this will be set to the
        ``mat.shape[1]`` or ``max(items) + 1``, depending on the input
    check_num_negative_samples_is_valid: bool
        Check that ``num_negative_samples`` is less than ``num_items`` minus the largest number of
        items any single user has interacted with, failing an assertion if it is not. If the check
        is skipped, users with too few non-interacted items can have positive items returned among
        their negative samples
    max_number_of_samples_to_consider: int
        Number of samples to try for a given user before returning an approximate negative sample.
        This should be greater than ``num_negative_samples``. If set to ``0``, approximate negative
        sampling will be used by default in ``__getitem__`` and a positive item lookup set will
        NOT be built
    seed: int
        Seed handed to ``random.seed``, which drives exact negative sampling. If ``None``, one is
        requested from ``collie.utils.get_random_seed()``

    """
    def __init__(self,
                 mat: Optional[Union[coo_matrix, np.array]] = None,
                 users: Optional[Iterable[int]] = None,
                 items: Optional[Iterable[int]] = None,
                 ratings: Optional[Iterable[int]] = None,
                 num_negative_samples: int = 10,
                 allow_missing_ids: bool = False,
                 remove_duplicate_user_item_pairs: bool = True,
                 num_users: int = 'infer',
                 num_items: int = 'infer',
                 check_num_negative_samples_is_valid: bool = True,
                 max_number_of_samples_to_consider: int = 200,
                 seed: Optional[int] = None):
        if mat is None and ratings is not None and 0 in set(ratings):
            warnings.warn(
                '``ratings`` contain ``0``s, which are ignored for implicit data.'
                ' Filtering these rows out.'
            )

            # a ``set`` keeps every membership test made while filtering O(1), so dropping rows
            # stays linear in the size of the data
            indices_to_drop = {idx for idx, rating in enumerate(ratings) if rating == 0}
            users = _drop_array_values_by_idx(array=users, indices_to_drop=indices_to_drop)
            items = _drop_array_values_by_idx(array=items, indices_to_drop=indices_to_drop)
            ratings = _drop_array_values_by_idx(array=ratings, indices_to_drop=indices_to_drop)

        super().__init__(mat=mat,
                         users=users,
                         items=items,
                         ratings=ratings,
                         allow_missing_ids=allow_missing_ids,
                         remove_duplicate_user_item_pairs=remove_duplicate_user_item_pairs,
                         num_users=num_users,
                         num_items=num_items)

        if seed is None:
            seed = collie.utils.get_random_seed()

        self.num_negative_samples = num_negative_samples
        self.max_number_of_samples_to_consider = max_number_of_samples_to_consider
        self.check_num_negative_samples_is_valid = check_num_negative_samples_is_valid
        self.seed = seed

        # NOTE: this seeds the global ``random`` module state, not a generator private to this
        # instance
        random.seed(self.seed)

        assert self.num_negative_samples >= 1

        if (
            self.num_negative_samples >= self.max_number_of_samples_to_consider
            and self.max_number_of_samples_to_consider > 0
        ):
            # no warning for ``max_number_of_samples_to_consider==0`` since it is likely intentional
            warnings.warn(
                '``num_negative_samples > max_number_of_samples_to_consider``. Approximate negative'
                ' sampling will be used.'
            )

        if self.check_num_negative_samples_is_valid:
            print('Checking ``num_negative_samples`` is valid...')

            # the busiest user has the fewest items left over to serve as negatives, so they set
            # the upper bound on ``num_negative_samples``
            counter = collections.Counter(self.mat.row)
            max_number_of_items_interacted_with = counter.most_common(1)[0][1]
            print('Maximum number of items a user has interacted with: {}'.format(
                max_number_of_items_interacted_with
            ))

            del counter

            is_valid = (
                self.num_negative_samples
                < (self.num_items - max_number_of_items_interacted_with)
            )
            assert is_valid, '``num_negative_samples`` must be less than {}!'.format(
                (self.num_items - max_number_of_items_interacted_with)
            )

        # When an ``Interactions`` is instantiated with exact negative sampling, a
        # ``positive_items`` attribute is created, a ``set`` of the ``mat`` object that enables
        # fast, O(1), ``(row, col)`` lookup. When ``__getitem__`` is called, negative item IDs are
        # sampled one-at-a-time from all possible values in ``self.num_items``, we check if that
        # user ID, item ID pair is in ``self.positive_items``, and sample continuously until we
        # have a negative match or reach a limit of ``max_number_of_samples_to_consider`` tries
        self.positive_items = {}

        if self.max_number_of_samples_to_consider > 0:
            print('Generating positive items set...')
            self._generate_positive_item_set()

    def _generate_positive_item_set(self) -> None:
        """Build the ``(user ID, item ID)`` lookup set used for exact negative sampling."""
        self.positive_items = set(zip(self.mat.row, self.mat.col))

    def __repr__(self) -> str:
        """String representation of ``Interactions`` class."""
        return (
            f'Interactions object with {self.num_interactions} interactions between'
            f' {self.num_users} users and {self.num_items} items, returning'
            f' {self.num_negative_samples} negative samples per interaction.'
        )

    def __getitem__(self, index: Union[int, Iterable[int]]) -> (
        Union[Tuple[Tuple[int, int], np.array], Tuple[Tuple[np.array, np.array], np.array]]
    ):
        """
        Access item in the ``Interactions`` instance, returning negative samples as well.

        Parameters
        ----------
        index: int or Iterable[int]
            Position(s) into the row and column arrays of the COO matrix

        Returns
        -------
        item: tuple
            ``((user IDs, item IDs), negative item IDs)``

        """
        user_id = self.mat.row[index]
        item_id = self.mat.col[index]
        # the rating in ``self.mat.data[index]`` is not needed for any loss currently implemented

        negative_item_ids_array = self._negative_sample(user_id)

        return (user_id, item_id), negative_item_ids_array

    def _negative_sample(self, user_id: Union[int, np.array]) -> np.array:
        """
        Generate negative samples for a ``user_id``.

        Parameters
        ----------
        user_id: int or np.array
            A single user ID or an iterable of user IDs

        Returns
        -------
        negative_item_ids_array: np.array
            ``num_negative_samples`` item IDs for a scalar ``user_id``. For an iterable of user
            IDs, one row of ``num_negative_samples`` item IDs per user. Note that exact sampling
            returns a 1-d array for an iterable holding a single user ID

        """
        if self.max_number_of_samples_to_consider > 0:
            # if we are here, we are doing true negative sampling
            negative_item_ids_list = list()

            if not isinstance(user_id, collections.abc.Iterable):
                user_id = [user_id]

            for specific_user_id in user_id:
                # the retry budget is shared by all of this user's negative samples
                samples_checked = 0
                temp_negative_item_ids_list = list()

                while len(temp_negative_item_ids_list) < self.num_negative_samples:
                    negative_item_id = random.choice(range(self.num_items))

                    # resample while the candidate is an item the user has interacted with or
                    # one we have already picked for this user
                    while (
                        (specific_user_id, negative_item_id) in self.positive_items
                        or negative_item_id in temp_negative_item_ids_list
                    ):
                        if samples_checked >= self.max_number_of_samples_to_consider:
                            # retry budget spent: fill the remaining slots with unchecked random
                            # items. The ``- 1`` leaves room for the current candidate, which is
                            # appended right after this loop is left
                            num_samples_left_to_generate = (
                                self.num_negative_samples - len(temp_negative_item_ids_list) - 1
                            )
                            temp_negative_item_ids_list += random.choices(
                                range(self.num_items), k=num_samples_left_to_generate
                            )
                            break

                        negative_item_id = random.choice(range(self.num_items))
                        samples_checked += 1

                    temp_negative_item_ids_list.append(negative_item_id)

                negative_item_ids_list += [np.array(temp_negative_item_ids_list)]

            if len(user_id) > 1:
                negative_item_ids_array = np.stack(negative_item_ids_list)
            else:
                negative_item_ids_array = negative_item_ids_list[0]
        else:
            # if we are here, we are doing approximate negative sampling
            if isinstance(user_id, collections.abc.Iterable):
                size = (len(user_id), self.num_negative_samples)
            else:
                size = (self.num_negative_samples,)

            # NOTE: this draws from NumPy's global generator, which ``__init__`` does not seed
            negative_item_ids_array = np.random.randint(
                low=0,
                high=self.num_items,
                size=size,
            )

        return negative_item_ids_array
class ExplicitInteractions(BaseInteractions):
    """
    PyTorch ``Dataset`` for explicit user-item interactions data.

    If ``mat`` is provided, the ``ExplicitInteractions`` instance will act as a wrapper for a
    sparse matrix in COOrdinate format, typically looking like:

    * Users comprising the rows

    * Items comprising the columns

    * Ratings given by that user for that item comprising the elements of the matrix

    ``ExplicitInteractions`` can be instantiated instead by passing in single arrays with
    corresponding user_ids, item_ids, and ratings values with the same functionality as a matrix.
    Note that with this approach, the number of users and items will be the maximum values in
    those two columns, respectively, and it is expected that all integers between 0 and the
    maximum ID should appear somewhere in the user or item ID data.

    Unlike in ``Interactions``, there is no need for negative sampling for explicit data. Each
    ``__getitem__`` call will thus return a single, non-nested tuple containing user IDs, item IDs,
    and ratings. This nested vs. non-nested structure is key for the model to determine where it
    should be implicit or explicit. Use the table below for reference:

    .. list-table::
        :header-rows: 1

        * - ``__getitem__`` Format
          - Expected Meaning
          - Model Type
        * - ``((X, Y), Z)``
          - ``((user IDs, item IDs), negative item IDs)``
          - **Implicit**
        * - ``(X, Y, Z)``
          - ``(user IDs, item IDs, ratings)``
          - **Explicit**

    Parameters
    ----------
    mat: scipy.sparse.coo_matrix or numpy.array, 2-dimensional
        Interactions matrix, which, if provided, will be used instead of ``users``, ``items``, and
        ``ratings`` arguments
    users: Iterable[int], 1-d
        Array of user IDs, starting at 0
    items: Iterable[int], 1-d
        Array of corresponding item IDs to ``users``, starting at 0
    ratings: Iterable[int], 1-d
        Array of corresponding ratings to both ``users`` and ``items``. Required whenever ``mat``
        is not provided, since explicit data has no default rating
    allow_missing_ids: bool
        If ``False``, will check that the user and item IDs contain each integer from 0 to the
        number of users and items minus one, respectively
    remove_duplicate_user_item_pairs: bool
        Will check for and remove any duplicate user, item ID pairs from the
        ``ExplicitInteractions`` matrix during initialization. Note that this will create a second
        sparse matrix held in memory to efficiently check, which could cause memory concerns for
        larger data. If you are sure that there are no duplicated user, item ID pairs, set to
        ``False``
    num_users: int
        Number of users in the dataset. If ``num_users == 'infer'``, this will be set to the
        ``mat.shape[0]`` or ``max(users) + 1``, depending on the input
    num_items: int
        Number of items in the dataset. If ``num_items == 'infer'``, this will be set to the
        ``mat.shape[1]`` or ``max(items) + 1``, depending on the input

    Raises
    ------
    ValueError
        If both ``mat`` and ``ratings`` are ``None``

    """
    def __init__(self,
                 mat: Optional[Union[coo_matrix, np.array]] = None,
                 users: Optional[Iterable[int]] = None,
                 items: Optional[Iterable[int]] = None,
                 ratings: Optional[Iterable[int]] = None,
                 allow_missing_ids: bool = False,
                 remove_duplicate_user_item_pairs: bool = True,
                 num_users: int = 'infer',
                 num_items: int = 'infer'):
        # explicit data is meaningless without rating values, so refuse to fall back to a default
        if mat is None and ratings is None:
            raise ValueError(
                'Ratings must be provided to ``ExplicitInteractions`` with ``mat`` or ``ratings``'
                ' - both cannot be ``None``!'
            )

        super().__init__(mat=mat,
                         users=users,
                         items=items,
                         ratings=ratings,
                         allow_missing_ids=allow_missing_ids,
                         remove_duplicate_user_item_pairs=remove_duplicate_user_item_pairs,
                         num_users=num_users,
                         num_items=num_items)

    @property
    def num_negative_samples(self) -> int:
        """Does not exist for explicit data; accessing it always raises ``AttributeError``."""
        raise AttributeError('``num_negative_samples`` does not exist for explicit datasets.')

    def __repr__(self) -> str:
        """String representation of ``ExplicitInteractions`` class."""
        return (
            f'ExplicitInteractions object with {self.num_interactions} interactions between'
            f' {self.num_users} users and {self.num_items} items, with minimum rating of'
            f' {self.min_rating} and maximum rating of {self.max_rating}.'
        )

    def __getitem__(self, index: Union[int, Iterable[int]]) -> (
        Union[Tuple[int, int, np.array], Tuple[np.array, np.array, np.array]]
    ):
        """
        Access item in the ``ExplicitInteractions`` instance.

        Parameters
        ----------
        index: int or Iterable[int]
            Position(s) into the row, column, and data arrays of the COO matrix

        Returns
        -------
        item: tuple
            ``(user IDs, item IDs, ratings)``

        """
        user_id = self.mat.row[index]
        item_id = self.mat.col[index]
        rating = self.mat.data[index]

        return user_id, item_id, rating
class HDF5Interactions(torch.utils.data.Dataset):
    """
    Create an ``Interactions``-like object for data in the HDF5 format that might be too large to
    fit in memory.

    Many of the same features of ``Interactions`` are implemented here, with the exception that
    approximate negative sampling will always be used.

    Parameters
    ----------
    hdf5_path: str
        Path to the HDF5 file. Interactions are read from its ``interactions`` key
    user_col: str
        Column in HDF5 file with user IDs. IDs must begin at 0
    item_col: str
        Column in HDF5 file with item IDs. IDs must begin at 0
    num_negative_samples: int
        Number of negative samples to return with each ``__getitem__`` call
    num_users: int
        Number of users in the dataset. If ``num_users == 'infer'``, the value is read from the
        ``meta`` key of the HDF5 file. If there is no such key, this will be set to the maximum
        value in ``user_col`` + 1, found by iterating through the entire dataset
    num_items: int
        Number of items in the dataset. If ``num_items == 'infer'``, the value is read from the
        ``meta`` key of the HDF5 file. If there is no such key, this will be set to the maximum
        value in ``item_col`` + 1, found by iterating through the entire dataset
    seed: int
        Seed for random sampling and shuffling if ``shuffle is True``. If ``None``, one is
        requested from ``collie.utils.get_random_seed()``
    shuffle: bool
        Shuffle data in a batch. For example, if one calls ``__getitem__`` with
        ``start_idx_and_batch_size = (0, 4)`` and ``shuffle is False``, this will always return the
        data at indices 0, 1, 2, 3 in order. However, the same call with ``shuffle = True`` will
        return a random shuffle of 0, 1, 2, 3 each call. This is recommended for use in a
        ``HDF5InteractionsDataLoader`` for training data in lieu of true data shuffling

    Raises
    ------
    ValueError
        If the counts have to be found by scanning the data and the minimum user or item ID is
        not 0

    """
    def __init__(self,
                 hdf5_path: str,
                 user_col: str = 'users',
                 item_col: str = 'items',
                 num_negative_samples: int = 10,
                 num_users: int = 'infer',
                 num_items: int = 'infer',
                 seed: Optional[int] = None,
                 shuffle: bool = False):
        self.hdf5_path = hdf5_path
        self.user_col = user_col
        self.item_col = item_col
        self.num_negative_samples = num_negative_samples
        self.seed = seed
        self.shuffle = shuffle

        with pd.HDFStore(self.hdf5_path, mode='r', complib='blosc') as store:
            # NOTE(review): used below as an integer row count - confirm ``shape`` is an ``int``
            # for the storage format these files are written in
            self.num_interactions = store.get_storer('interactions').shape

            users_were_given = isinstance(num_users, int)
            items_were_given = isinstance(num_items, int)

            if users_were_given and items_were_given:
                self.num_users = num_users
                self.num_items = num_items
            else:
                inferred_num_users, inferred_num_items = self._infer_num_users_and_items(store)

                # an explicitly passed count always wins over an inferred one
                self.num_users = num_users if users_were_given else inferred_num_users
                self.num_items = num_items if items_were_given else inferred_num_items

        assert self.num_users > 1
        assert self.num_items > 1

        if self.seed is None:
            self.seed = collie.utils.get_random_seed()

        # NOTE: this seeds NumPy's global generator, not one private to this instance
        np.random.seed(seed=self.seed)

    def _infer_num_users_and_items(self, store: pd.HDFStore) -> Tuple[int, int]:
        """Read the user and item counts from ``meta`` or, failing that, scan ``interactions``."""
        try:
            meta = store.select('meta')
            return meta['num_users'].item(), meta['num_items'].item()
        except KeyError:
            print('``meta`` key not found - generating ``num_users`` and ``num_items``.')

        max_user_id = 0
        max_item_id = 0

        # while we are here, we can also check minimum IDs are 0 for free. Starting from infinity
        # means the error below reports the minimum IDs actually found in the data
        # TODO: is there a more efficient way to check this? should we always check?
        min_user_id = float('inf')
        min_item_id = float('inf')

        # default Pandas ``chunksize`` is 100000, so we will use that too
        chunksize = 100000

        for idx in tqdm(range(0, self.num_interactions, chunksize)):
            chunk = store.select('interactions', start=idx, stop=(idx + chunksize))

            max_user_id = max(chunk[self.user_col].max(), max_user_id)
            max_item_id = max(chunk[self.item_col].max(), max_item_id)
            min_user_id = min(chunk[self.user_col].min(), min_user_id)
            min_item_id = min(chunk[self.item_col].min(), min_item_id)

        if min_user_id != 0 or min_item_id != 0:
            raise ValueError(
                f'Minimum values of {self.user_col} and {self.item_col} in HDF5 data must both be'
                f' 0, not {min_user_id} and {min_item_id}, respectively.'
            )

        # add one here since ``users`` and ``items`` are both zero-indexed
        return max_user_id + 1, max_item_id + 1

    def __getitem__(self, start_idx_and_batch_size: Tuple[int, int]) -> (
        Tuple[Tuple[np.array, np.array], np.array]
    ):
        """
        Get a batch of data.

        Parameters
        ----------
        start_idx_and_batch_size: tuple
            ``(start index, batch size)``. Any non-tuple value is treated as a start index with a
            batch size of 1

        Returns
        -------
        batch: tuple
            ``((user IDs, item IDs), negative item IDs)``, where the negative item IDs are drawn
            uniformly at random from ``[0, num_items)`` with one row of ``num_negative_samples``
            IDs per interaction

        Raises
        ------
        IndexError
            If no rows exist at the requested start index

        """
        if isinstance(start_idx_and_batch_size, tuple):
            start_idx, batch_size = start_idx_and_batch_size
        else:
            start_idx = start_idx_and_batch_size
            batch_size = 1

        chunk = self._get_data_chunk(start_idx, batch_size)

        if len(chunk) == 0:
            raise IndexError(f'Index {start_idx} out of range for HDF5 data.')

        user_ids = chunk[self.user_col].to_numpy()
        item_ids = chunk[self.item_col].to_numpy()
        # ratings are not read since no implemented loss needs them yet

        if self.shuffle:
            # a single permutation keeps each user ID paired with its item ID
            idxs = np.random.permutation(len(user_ids))
            user_ids = user_ids[idxs]
            item_ids = item_ids[idxs]

        negative_item_ids = np.random.randint(
            low=0,
            high=self.num_items,
            size=(len(user_ids), self.num_negative_samples)
        )

        return (user_ids, item_ids), negative_item_ids

    def _get_data_chunk(self, start_idx: int, batch_size: int) -> pd.DataFrame:
        """Read ``batch_size`` rows of ``interactions`` beginning at row ``start_idx``."""
        with pd.HDFStore(self.hdf5_path, mode='r', complib='blosc') as store:
            return store.select('interactions',
                                start=start_idx,
                                stop=(start_idx + batch_size))

    def __len__(self) -> int:
        """Number of interactions in the ``HDF5Interactions`` instance."""
        return self.num_interactions

    def __repr__(self) -> str:
        """String representation of ``HDF5Interactions`` class."""
        return (
            f'HDF5Interactions object with {self.num_interactions} interactions between'
            f' {self.num_users} users and {self.num_items} items, returning'
            f' {self.num_negative_samples} negative samples per interaction.'
        )

    def head(self, n: int = 5) -> pd.DataFrame:
        """Return the first ``n`` rows of the underlying pd.DataFrame."""
        n = self._prep_head_tail_n(n=n)

        return self._get_data_chunk(0, n)

    def tail(self, n: int = 5) -> pd.DataFrame:
        """Return the last ``n`` rows of the underlying pd.DataFrame."""
        n = self._prep_head_tail_n(n=n)

        return self._get_data_chunk(self.num_interactions - n, n)

    def _prep_head_tail_n(self, n: int) -> int:
        """
        Translate ``n`` into a row count between 0 and ``num_interactions``, inclusive.

        A negative ``n`` is read as "all but the last ``abs(n)`` rows". The result is clamped at
        0 so that a negative count is never turned into a negative ``stop`` for ``store.select``.

        """
        if n < 0:
            n = self.num_interactions + n

        return max(0, min(n, self.num_interactions))
def _check_array_contains_all_integers(array: Iterable[int],
array_max_value: int,
array_name: str = 'Array') -> None:
"""Check that an array has all numbers between 0 and ``array_max``."""
if set(array) != set(range(array_max_value)):
raise ValueError(
f'``{array_name}`` must contain every integer between 0 and {array_max_value - 1}. '
+ 'To override this error, set ``allow_missing_ids`` to True.'
)
def _drop_array_values_by_idx(array: Iterable[Any], indices_to_drop: Iterable[int]) -> List[Any]:
return [element for idx, element in enumerate(array) if idx not in indices_to_drop]