from fastai.tabular.all import *
from fastai.collab import *
# Fetch the MovieLens 100k dataset; `path` points at its local folder.
path = untar_data(URLs.ML_100k)
# The main table lives in u.data. It is not a proper csv, so we spell out how
# to parse it: tab separator, no header row, which columns to keep and what to
# call them.
ratings = pd.read_csv(
    path/'u.data',
    sep='\t',
    header=None,
    usecols=(0, 1, 2),
    names=['user', 'movie', 'rating'],
)
ratings.head()
user movie rating
0 196 242 3
1 186 302 3
2 22 377 1
3 244 51 2
4 166 346 1
# Movie ids are not very informative to look at, so we load the mapping from movie id to title, which is stored in the table u.item:
# u.item is pipe-separated, latin-1 encoded and has no header row; only its
# first two columns (movie id and title) are kept.
movies = pd.read_csv(
    path/'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=(0, 1),
    names=('movie', 'title'),
)
movies.head()
movie title
0 1 Toy Story (1995)
1 2 GoldenEye (1995)
2 3 Four Rooms (1995)
3 4 Get Shorty (1995)
4 5 Copycat (1995)
# Attach the title to every rating row. No key is given, so pandas joins on
# the columns the two frames have in common.
ratings = pd.merge(ratings, movies)
ratings.head()
user movie rating title
0 196 242 3 Kolya (1996)
1 63 242 3 Kolya (1996)
2 226 242 5 Kolya (1996)
3 154 242 3 Kolya (1996)
4 306 242 5 Kolya (1996)
# We can then build a DataLoaders object from this table. By default, it takes the first column for the user,
# the second column for the item (here our movies) and the third column for the ratings. We need to change the value of item_name in our case, to use the titles instead of the ids:
dls = CollabDataLoaders.from_df(
    ratings,
    item_name='title',  # use the title column as the item instead of the id
    bs=64,
)
# Display a few sample rows from the loaders as a sanity check.
dls.show_batch()
user title rating
0 181 Substitute, The (1996) 1
1 189 Ulee's Gold (1997) 3
2 6 L.A. Confidential (1997) 4
3 849 Net, The (1995) 5
4 435 Blade Runner (1982) 4
5 718 My Best Friend's Wedding (1997) 4
# fastai can create and train a collaborative filtering model by using collab_learner:
# Collaborative-filtering learner with 50 factors; y_range is passed through
# to collab_learner unchanged.
learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))
# One-cycle training: 5 epochs, maximum learning rate 5e-3, weight decay 0.1.
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3, wd=0.1)
# Number of ratings each title received.
g = ratings.groupby('title')['rating'].count()
# Titles ordered from most to least rated; keep at most the first 1000.
top_movies = g.sort_values(ascending=False).index[:1000].values
top_movies[:10]
> array(['Star Wars (1977)', 'Contact (1997)', 'Fargo (1996)',
'Return of the Jedi (1983)', 'Liar Liar (1997)',
'English Patient, The (1996)', 'Scream (1996)', 'Toy Story (1995)',
'Air Force One (1997)', 'Independence Day (ID4) (1996)'],
dtype=object)
# Our model has learned one bias per movie, a unique number independent of users that can be interpreted as the intrinsic “value” of the movie.
# We can grab the bias of each movie in our top_movies list with the following command:
# Learned bias for each title in top_movies (is_item=True selects the item
# side of the model rather than the user side).
movie_bias = learn.model.bias(top_movies, is_item=True)
movie_bias.shape

# Average observed rating per title, to compare against the learned bias.
mean_ratings = ratings.groupby('title')['rating'].mean()

# One (bias, title, mean rating) triple per top movie.
movie_ratings = [
    (bias, title, mean_ratings.loc[title])
    for title, bias in zip(top_movies, movie_bias)
]


def item0(o):
    """Return the first element of `o`; used as a sort key."""
    return o[0]


# The 15 entries with the lowest bias.
sorted(movie_ratings, key=item0)[:15]
# Same lookup as for the biases, but for the item weights of the top movies.
# NOTE(review): presumably one n_factors-long vector per title -- the shape
# shown below should confirm this.
movie_w = learn.model.weight(
    top_movies,
    is_item=True,
)
movie_w.shape