import os
from pathlib import Path
import re
import zipfile
import pandas as pd
import requests
from collie.config import DATA_PATH
def read_movielens_df(decrement_ids: bool = True) -> pd.DataFrame:
    """
    Load the ``u.data`` ratings file of the MovieLens 100K dataset.

    When ``u.data`` cannot be found under ``$DATA_PATH/ml-100k``, the full dataset is downloaded
    first.

    See the MovieLens 100K README for additional information on the dataset:
    https://files.grouplens.org/datasets/movielens/ml-100k-README.txt

    Parameters
    ----------
    decrement_ids: bool
        Subtract 1 from every user and item ID before returning, which is required for Collie's
        ``Interactions`` dataset

    Returns
    -------
    df: pd.DataFrame
        MovieLens 100K ``u.data`` with columns:

            * user_id

            * item_id

            * rating

            * timestamp

    Side Effects
    ------------
    Creates the data directory if it is missing and downloads the dataset files when ``u.data``
    does not exist yet.

    """
    _make_data_path_dirs_if_not_exist()

    ratings_path = os.path.join(DATA_PATH, 'ml-100k', 'u.data')
    if not Path(ratings_path).exists():
        _download_movielens_100k()

    # ``u.data`` is tab-separated and is read without a header row
    df = pd.read_csv(
        ratings_path,
        sep='\t',
        names=['user_id', 'item_id', 'rating', 'timestamp'],
    )

    if decrement_ids:
        for id_column in ('user_id', 'item_id'):
            df[id_column] = df[id_column] - 1

    return df
def read_movielens_df_item() -> pd.DataFrame:
    """
    Load the ``u.item`` file of the MovieLens 100K dataset.

    When ``u.item`` cannot be found under ``$DATA_PATH/ml-100k``, the full dataset is downloaded
    first.

    See the MovieLens 100K README for additional information on the dataset:
    https://files.grouplens.org/datasets/movielens/ml-100k-README.txt

    Returns
    -------
    df_item: pd.DataFrame
        MovieLens 100K ``u.item`` with ``release_date`` parsed to datetimes and the
        ``video_release_date`` column dropped, leaving columns:

            * item_id

            * movie_title

            * release_date

            * IMDb_URL

            * unknown

            * Action

            * Adventure

            * Animation

            * Children

            * Comedy

            * Crime

            * Documentary

            * Drama

            * Fantasy

            * Film_Noir

            * Horror

            * Musical

            * Mystery

            * Romance

            * Sci_Fi

            * Thriller

            * War

            * Western

    Side Effects
    ------------
    Creates the data directory if it is missing and downloads the dataset files when ``u.item``
    does not exist yet.

    """
    _make_data_path_dirs_if_not_exist()

    item_file = os.path.join(DATA_PATH, 'ml-100k', 'u.item')
    if not Path(item_file).exists():
        _download_movielens_100k()

    leading_columns = [
        'item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    ]
    trailing_columns = [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror', 'Musical', 'Mystery',
        'Romance', 'Sci_Fi', 'Thriller', 'War', 'Western',
    ]

    # ``u.item`` is pipe-separated and is read without a header row
    items = pd.read_csv(
        item_file,
        sep='|',
        encoding='latin-1',
        names=leading_columns + trailing_columns,
    )
    items['release_date'] = pd.to_datetime(items['release_date'])

    return items.drop(columns=['video_release_date'])
def read_movielens_df_user() -> pd.DataFrame:
    """
    Load the ``u.user`` file of the MovieLens 100K dataset.

    When ``u.user`` cannot be found under ``$DATA_PATH/ml-100k``, the full dataset is downloaded
    first.

    See the MovieLens 100K README for additional information on the dataset:
    https://files.grouplens.org/datasets/movielens/ml-100k-README.txt

    Returns
    -------
    df_user: pd.DataFrame
        MovieLens 100K ``u.user`` with columns:

            * user_id

            * age

            * gender

            * occupation

            * zip

    Side Effects
    ------------
    Creates the data directory if it is missing and downloads the dataset files when ``u.user``
    does not exist yet.

    """
    _make_data_path_dirs_if_not_exist()

    user_file = os.path.join(DATA_PATH, 'ml-100k', 'u.user')
    if not Path(user_file).exists():
        _download_movielens_100k()

    # ``u.user`` is pipe-separated and is read without a header row
    return pd.read_csv(
        user_file,
        sep='|',
        encoding='latin-1',
        names=['user_id', 'age', 'gender', 'occupation', 'zip'],
    )
def _make_data_path_dirs_if_not_exist() -> None:
    """Create the ``DATA_PATH`` directory, including missing parents, if it does not exist."""
    if DATA_PATH.exists():
        # nothing to create, so stay silent
        return

    print(f'Making data path at ``{DATA_PATH}``...')
    DATA_PATH.mkdir(parents=True, exist_ok=True)
def _download_movielens_100k() -> None:
    """
    Download the MovieLens 100K archive and extract it into ``DATA_PATH``.

    The archive is streamed to ``DATA_PATH/ml-100k.zip`` in chunks and then extracted into
    ``DATA_PATH``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an unsuccessful HTTP status code
    requests.Timeout
        If the server does not respond within the timeout

    Side Effects
    ------------
    Writes ``ml-100k.zip`` into ``DATA_PATH`` and extracts the archive contents there.

    """
    url = 'https://files.grouplens.org/datasets/movielens/ml-100k.zip'
    zip_path = os.path.join(DATA_PATH, 'ml-100k.zip')

    print('Downloading MovieLens 100K data...')

    # the timeout bounds the connection attempt and each wait for data, not the whole download
    with requests.get(url, stream=True, timeout=60) as response:
        # fail here rather than saving an HTTP error page as a ``.zip`` file
        response.raise_for_status()

        with open(zip_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    with zipfile.ZipFile(zip_path, 'r') as z:
        z.extractall(DATA_PATH)
def read_movielens_posters_df() -> pd.DataFrame:
    """
    Load the item ID to poster URL mapping used to visualize MovieLens 100K data.

    A local copy at ``data/movielens_posters.csv`` is used when it exists. Otherwise, the CSV is
    read from the origin GitHub repo at
    https://raw.githubusercontent.com/ShopRunner/collie/main/data/movielens_posters.csv.

    Returns
    -------
    posters_df: pd.DataFrame
        DataFrame comprising columns:

            * item_id

            * url

    """
    # the ``data`` directory is looked up two levels above the directory holding this file
    package_dir = Path(__file__).parent.absolute()
    local_csv = os.path.join(package_dir.parent.parent / 'data', 'movielens_posters.csv')

    if os.path.exists(local_csv):
        csv_source = local_csv
    else:
        # no local copy available, so read the CSV from the origin GitHub repo instead
        csv_source = (
            'https://raw.githubusercontent.com/ShopRunner/collie/main/data/movielens_posters.csv'
        )

    return pd.read_csv(csv_source)