Sequence-Aware Recommender Systems
A tutorial on Sequence-Aware Recommender Systems
- Data loading
- Data statistics
- Split the dataset
- Fitting the recommender
- Sequential Evaluation
- Analysis of next-item recommendation
- Artifact versioning
- W&B Experiment Link
!mkdir datasets && \
cd datasets && \
wget https://raw.githubusercontent.com/mquad/sars_tutorial/master/datasets/sessions.zip && \
unzip sessions.zip
dataset = create_seq_db_filter_top_k(path=dataset_path, topk=1000, last_months=1)
dataset.head()
def random_holdout(dataset, perc=0.8, seed=1234):
"""
Split sequence dataset randomly
:param dataset: the sequence dataset
    :param perc: the training percentage
:param seed: the random seed
:return: the training and test splits
"""
dataset = dataset.sample(frac=1, random_state=seed)
nseqs = len(dataset)
train_size = int(nseqs * perc)
# split data according to the shuffled index and the holdout size
train_split = dataset[:train_size]
test_split = dataset[train_size:]
return train_split, test_split
def temporal_holdout(dataset, ts_threshold):
"""
Split sequence dataset using timestamps
:param dataset: the sequence dataset
:param ts_threshold: the timestamp from which test sequences will start
:return: the training and test splits
"""
train = dataset.loc[dataset['ts'] < ts_threshold]
test = dataset.loc[dataset['ts'] >= ts_threshold]
train, test = clean_split(train, test)
return train, test
def last_session_out_split(data,
user_key='user_id',
session_key='session_id',
time_key='ts'):
"""
Assign the last session of every user to the test set and the remaining ones to the training set
"""
sessions = data.sort_values(by=[user_key, time_key]).groupby(user_key)[session_key]
last_session = sessions.last()
    train = data[~data[session_key].isin(last_session.values)].copy()
    test = data[data[session_key].isin(last_session.values)].copy()
train, test = clean_split(train, test)
return train, test
def clean_split(train, test):
"""
Remove new items from the test set.
:param train: The training set.
:param test: The test set.
:return: The cleaned training and test sets.
"""
train_items = set()
train['sequence'].apply(lambda seq: train_items.update(set(seq)))
test['sequence'] = test['sequence'].apply(lambda seq: [it for it in seq if it in train_items])
return train, test
def balance_dataset(x, y):
    # assumes `random` and scipy.sparse.find are already imported, as in the other utilities
    number_of_elements = y.shape[0]
    nnz = set(find(y)[0])
    zero = set(range(number_of_elements)).difference(nnz)
    max_samples = min(len(zero), len(nnz))
    # random.sample requires a sequence, so convert the sets to lists before sampling
    nnz_indices = random.sample(list(nnz), max_samples)
    zero_indices = random.sample(list(zero), max_samples)
    indices = nnz_indices + zero_indices
    return x[indices, :], y[indices, :]
For simplicity, let's split the dataset by assigning the last session of every user to the test set, and all the previous ones to the training set.
train_data, test_data = last_session_out_split(dataset)
print("Train sessions: {} - Test sessions: {}".format(len(train_data), len(test_data)))
class ISeqRecommender(object):
"""Abstract Recommender class"""
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()
def __init__(self):
super(ISeqRecommender, self).__init__()
def fit(self, train_data):
pass
def recommend(self, user_profile, user_id=None):
"""
Given the user profile return a list of recommendation
:param user_profile: the user profile as a list of item identifiers
:param user_id: (optional) the user id
:return: list of recommendations e.g. [([2], 0.875), ([6], 1.0)]
"""
pass
@staticmethod
def get_recommendation_list(recommendation):
return list(map(lambda x: x[0], recommendation))
@staticmethod
def get_recommendation_confidence_list(recommendation):
return list(map(lambda x: x[1], recommendation))
def activate_debug_print(self):
self.logger.setLevel(logging.DEBUG)
def deactivate_debug_print(self):
self.logger.setLevel(logging.INFO)
class PopularityRecommender(ISeqRecommender):
def __init__(self):
super(PopularityRecommender, self).__init__()
def fit(self, train_data):
sequences = train_data['sequence'].values
count_dict = {}
for s in sequences:
for item in s:
if item not in count_dict:
count_dict[item] = 1
else:
count_dict[item] += 1
self.top = sorted(count_dict.items(), key=operator.itemgetter(1), reverse=True)
self.top = [([x[0]], x[1]) for x in self.top]
def recommend(self, user_profile, user_id=None):
return self.top
def get_popular_list(self):
return self.top
poprecommender = PopularityRecommender()
poprecommender.fit(train_data)
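As a quick sanity check, here is a minimal usage sketch (it assumes the train/test split computed above): the popularity recommender ignores the user profile and always returns the globally most frequent items.
# Illustrative only: query the fitted popularity recommender with one test session profile
sample_profile = test_data['sequence'].values[0]
top5 = poprecommender.recommend(sample_profile)[:5]
print(ISeqRecommender.get_recommendation_list(top5))             # the recommended items
print(ISeqRecommender.get_recommendation_confidence_list(top5))  # their raw popularity counts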
This algorithm extracts Frequent Sequential Patterns from all the training sequences. Patterns with support lower than minsup are discarded (the support of a pattern is the number of its occurrences in the training data).
Recommendations are then generated by looking for patterns whose prefix matches the last max_context down to min_context elements of the user profile, taken in order. Matches are then sorted by decreasing confidence score (the ratio between the support of the matched rule and the support of its context). Matches with confidence below minconf are discarded.
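As a worked example with made-up numbers: if the frequent sequence (A, B, C) has support 20 and its prefix (A, B) has support 80, then recommending C after the context (A, B) has confidence 20 / 80 = 0.25; with minconf = 0.1 this recommendation is kept, while a match with confidence 0.05 would be discarded.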
The class FSMRecommender has the following initialization hyper-parameters:
- minsup: the minimum support threshold. It is interpreted as a relative count if in [0-1], otherwise as an absolute count. NOTE: a relative count is required for training with SPMF (faster).
- minconf: the minimum confidence threshold. Used to filter out irrelevant recommendations.
- max_context: the maximum number of items in the user profile (starting from the last) that will be used for lookup in the database of frequent sequences.
- min_context: the minimum number of items in the user profile (starting from the last) that will be used for lookup in the database of frequent sequences.
- spmf_path: the path to the SPMF jar file. If provided, the SPMF library will be used for pattern extraction (algorithm: PrefixSpan). Otherwise, pymining is used, which can be significantly slower depending on the size of the sequence database.
- db_path: the path to the sequence database file.
class FSMRecommender(ISeqRecommender):
"""Frequent Sequence Mining recommender"""
def __init__(self, minsup, minconf, max_context=1, min_context=1, spmf_path=None, db_path=None):
"""
:param minsup: the minimum support threshold. It is interpreted as relative count if in [0-1],
            otherwise as an absolute count. NOTE: a relative count is required for training with SPMF (faster).
:param minconf: the minimum confidence threshold.
:param max_context: (optional) the maximum number of items in the user profile (starting from the last) that will be used
for lookup in the database of frequent sequences.
:param min_context: (optional) the minimum number of items in the user profile (starting from the last) that will be used
for lookup in the database of frequent sequences.
        :param spmf_path: (optional) path to the SPMF jar file. If provided, the SPMF library will be used for pattern extraction (algorithm: PrefixSpan).
            Otherwise, pymining is used, which can be significantly slower depending on the size of the sequence database.
:param db_path: (optional) path to the sequence database file
"""
super(FSMRecommender, self).__init__()
self.minsup = minsup
self.minconf = minconf
self.max_context = max_context
self.min_context = min_context
self.recommendation_length = 1
self.db_path = db_path
self.spmf_path = spmf_path
self.spmf_algorithm = "PrefixSpan"
self.output_path = "tmp/tmp_output.txt"
def __str__(self):
return 'FreqSeqMiningRecommender: ' \
'minsup={minsup}, ' \
'minconf={minconf}, ' \
'max_context={max_context}, ' \
'min_context={min_context}, ' \
'spmf_path={spmf_path}, ' \
'db_path={db_path}'.format(**self.__dict__)
def fit(self, train_data=None):
"""
Fit the model
:param train_data: (optional) DataFrame with the training sequences, which must be assigned to column "sequence".
            If None, run FSM using SPMF over the sequence database stored in `self.db_path`.
Otherwise, run FSM using `pymining.seqmining` (slower).
"""
if train_data is None:
            if self.spmf_path is None or self.db_path is None:
                raise ValueError("You should set db_path and spmf_path before calling fit() without arguments.")
            self.logger.info('Using SPMF (Java) for Frequent Sequence Mining')
            if 0 <= self.minsup <= 1:
                percentage_min_sup = self.minsup * 100
            else:
                raise ValueError("SPMF only accepts 0<=minsup<=1")
# call spmf
command = ' '.join([self.spmf_algorithm, self.db_path, self.output_path, str(percentage_min_sup) + '%'])
callSPMF(self.spmf_path, command)
# parse back output from text file
self._parse_spfm_output()
else:
# use pymining
self.logger.info('Using pymining.seqmining (python) for Frequent Sequence Mining')
sequences = train_data['sequence'].values
msup = int(self.minsup * len(sequences)) if 0 <= self.minsup <= 1 else self.minsup
self.logger.info('Mining frequent sequences (minsup={})'.format(msup))
self.freq_seqs = seqmining.freq_seq_enum(sequences, msup)
self.logger.info('{} frequent sequences found'.format(len(self.freq_seqs)))
self.logger.info('Building the prefix tree')
self.tree = SmartTree()
self.root_node = self.tree.set_root()
for pattern, support in self.freq_seqs:
if len(pattern) == 1:
# add node to root
self.tree.create_node(pattern[0], parent=self.root_node, data={"support": support})
elif len(pattern) > 1:
# add entire path starting from root
self.tree.add_path(self.root_node, pattern, support)
else:
raise ValueError('Frequent sequence of length 0')
self.logger.info('Training completed')
def recommend(self, user_profile, user_id=None):
n = len(user_profile)
c = min(n, self.max_context)
match = []
# iterate over decreasing context lengths until a match with sufficient confidence is found
while not match and c >= self.min_context:
q = user_profile[n - c:n]
match = self._find_match(q, self.recommendation_length)
c -= 1
return match
def _find_match(self, context, recommendation_length):
# search context
lastNode = self.tree.find_path(self.root_node, context)
if lastNode == -1:
return []
else: # context matched
context_support = self.tree[lastNode].data['support']
children = self.tree[lastNode].fpointer
if not children:
return []
# find all path of length recommendation_length from match
paths = self.tree.find_n_length_paths(lastNode, recommendation_length)
return sorted(self._filter_confidence(context_support, paths), key=lambda x: x[1], reverse=True)
def _filter_confidence(self, context_support, path_list):
goodPaths = []
for p in path_list:
confidence = self.tree[p[len(p) - 1]].data['support'] / float(context_support)
if confidence >= self.minconf:
goodPaths.append((self.tree.get_nodes_tag(p), confidence))
return goodPaths
def _set_tree_debug_only(self, tree):
self.tree = tree
self.root_node = tree.get_root()
def get_freq_seqs(self):
return self.freq_seqs
def get_sequence_tree(self):
return self.tree
def show_tree(self):
self.tree.show()
def get_confidence_list(self, recommendation):
return list(map(lambda x: x[1], recommendation))
def _parse_spfm_output(self):
with open(self.output_path, 'r') as fin:
self.freq_seqs = []
for line in fin:
pieces = line.split('#SUP: ')
support = pieces[1].strip()
items = pieces[0].split(' ')
seq = tuple(x for x in items if x != '' and x != '-1')
seq_and_support = (seq, int(support))
self.freq_seqs.append(seq_and_support)
db_path = 'tmp/sequences.txt'
sequences_to_spfm_format(train_data['sequence'], tmp_path=db_path)
# then we instantiate and fit the recommender
fsmrecommender = FSMRecommender(minsup=0.002,
minconf=0.1,
min_context=1,
max_context=10,
spmf_path='spmf/spmf.jar',
db_path=db_path)
# calling fit() without arguments to use SPFM and the sequences stored in db_path
fsmrecommender.fit()
Here we fit the recommendation algorithm over the sessions in the training set.
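A minimal usage sketch (again assuming the split computed earlier): we query the fitted model with the items of one test session and inspect the returned ([next_item], confidence) tuples.
# Illustrative only: pattern-based recommendations for one test session profile
sample_profile = test_data['sequence'].values[0]
recs = fsmrecommender.recommend(sample_profile)
print(recs[:3])  # ([item], confidence) tuples, sorted by decreasing confidence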
This recommender is based on the MarkovChainRecommender class, which implements:
Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system." Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4
This recommender computes the item transition matrices for every Markov Chain with order in [min_order, max_order]. Each individual Markov Chain model employs heuristics such as skipping and clustering to better cope with data sparsity. Recommendations are generated by sorting items by their transition probability of being the next item, given the user profile. The scores coming from Markov Chains of different orders are weighted inversely with respect to their order.
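For example (illustrative numbers only), with min_order=1 and max_order=2: if an item gets score 0.6 from the order-1 chain and 0.2 from the order-2 chain, its final score is (1/1 * 0.6 + 1/2 * 0.2) / (1/1 + 1/2) ≈ 0.47, so lower-order chains contribute more to the mixture.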
The class MixedMarkovChainRecommender has the following initialization hyper-parameters:
- min_order: the minimum order of the Mixed Markov Chain
- max_order: the maximum order of the Mixed Markov Chain
class MarkovChainRecommender(ISeqRecommender):
"""
Implementation from Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system."
Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4
"""
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def __init__(self, order):
"""
:param order: the order of the Markov Chain
"""
super(MarkovChainRecommender, self).__init__()
self.order = order
def fit(self, train_data):
sequences = train_data['sequence'].values
logging.info('Building Markov Chain model with k = ' + str(self.order))
logging.info('Adding nodes')
self.tree, self.count_dict, self.G = add_nodes_to_graph(sequences, self.order)
logging.info('Adding edges')
self.G = add_edges(self.tree, self.count_dict, self.G, self.order)
logging.info('Applying skipping')
self.G = apply_skipping(self.G, self.order, sequences)
logging.info('Applying clustering')
logging.info('{} states in the graph'.format(len(self.G.nodes())))
self.G, _, _ = apply_clustering(self.G)
# drop not useful resources
self.tree = None
self.count_dict = None
gc.collect()
def recommend(self, user_profile, user_id=None):
# if the user profile is longer than the markov order, chop it keeping recent history
state = tuple(user_profile[-self.order:])
# see if graph has that state
recommendations = []
if self.G.has_node(state):
# search for recommendations in the forward star
rec_dict = {}
for u, v in self.G.out_edges_iter([state]):
lastElement = tuple(v[-1:])
if lastElement in rec_dict:
rec_dict[lastElement] += self.G[u][v]['count']
else:
rec_dict[lastElement] = self.G[u][v]['count']
for k, v in rec_dict.items():
recommendations.append((list(k), v))
return recommendations
def _set_graph_debug(self, G):
self.G = G
class MixedMarkovChainRecommender(ISeqRecommender):
"""
    Creates Markov models with different values of k, and returns recommendations by weighting the
    recommendation lists of each model.
Reference: Shani, Guy, David Heckerman, and Ronen I. Brafman. "An MDP-based recommender system."
Journal of Machine Learning Research 6, no. Sep (2005): 1265-1295. Chapter 3-4
"""
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def __init__(self, min_order=1, max_order=1):
"""
:param min_order: the minimum order of the Mixed Markov Chain
:param max_order: the maximum order of the Mixed Markov Chain
"""
super(MixedMarkovChainRecommender, self).__init__()
        self.min_order = min_order
        self.max_order = max_order
        self.recommenders = {}
# define the models
for i in range(self.min_order, self.max_order + 1):
self.recommenders[i] = MarkovChainRecommender(i)
def fit(self, user_profile):
for order in self.recommenders:
self.recommenders[order].fit(user_profile)
def recommend(self, user_profile, user_id=None):
rec_dict = {}
recommendations = []
sum_of_weights = 0
for order, r in self.recommenders.items():
rec_list = r.recommend(user_profile)
sum_of_weights += 1 / order
for i in rec_list:
if tuple(i[0]) in rec_dict:
rec_dict[tuple(i[0])] += 1 / order * i[1]
else:
rec_dict[tuple(i[0])] = 1 / order * i[1]
for k, v in rec_dict.items():
recommendations.append((list(k), v / sum_of_weights))
return recommendations
def _set_model_debug(self, recommender, order):
self.recommenders[order] = recommender
mmcrecommender = MixedMarkovChainRecommender(min_order=1, max_order=1)
mmcrecommender.fit(train_data)
Here we fit the recommendation algorithm over the sessions in the training set.
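A minimal usage sketch (assuming the split computed earlier); with max_order=1 only the last item of the user profile is actually used as the Markov state.
# Illustrative only: next-item scores for one test session profile
sample_profile = test_data['sequence'].values[0]
print(mmcrecommender.recommend(sample_profile)[:5])  # ([item], weighted_score) tuples (not sorted)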
This recommender is based on the following paper:
Rendle, S., Freudenthaler, C., & Schmidt-Thieme, L. (2010). Factorizing personalized Markov chains for next-basket recommendation. Proceedings of the 19th International Conference on World Wide Web - WWW ’10, 811
In short, FPMC factorizes a personalized order-1 transition tensor using Tensor Factorization with a pairwise loss function akin to BPR (Bayesian Personalized Ranking).
Tensor Factorization allows imputing values for the missing transitions between items for each user. For this reason, FPMC can be used to generate personalized recommendations in session-aware recommenders as well.
In this notebook, you will be able to change the number of latent factors and a few other learning hyper-parameters and see the impact on the recommendation quality.
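The factorized scoring function can be summarized with a small numpy sketch (conceptual only, independent of the FPMCRecommender class below; all matrix names here are made up):
import numpy as np

# Conceptual FPMC score: item i as the next item for user u whose last item is l
# combines a user-item term and an item-item (Markov) term.
rng = np.random.default_rng(0)
n_users, n_items, k = 5, 10, 4
V_ui, V_iu = rng.normal(size=(n_users, k)), rng.normal(size=(n_items, k))  # user <-> item factors
V_il, V_li = rng.normal(size=(n_items, k)), rng.normal(size=(n_items, k))  # item <-> last-item factors

def fpmc_score(u, l, i):
    return V_ui[u] @ V_iu[i] + V_il[i] @ V_li[l]

print(fpmc_score(u=0, l=3, i=7))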
The class FPMCRecommender has the following initialization hyper-parameters:
- n_factor: (optional) the number of latent factors
- learn_rate: (optional) the learning rate
- regular: (optional) the L2 regularization coefficient
- n_epoch: (optional) the number of training epochs
- n_neg: (optional) the number of negative samples used in BPR learning
class FPMCRecommender(ISeqRecommender):
"""
Implementation of
Rendle, S., Freudenthaler, C., & Schmidt-Thieme, L. (2010). Factorizing personalized Markov chains for next-basket recommendation.
Proceedings of the 19th International Conference on World Wide Web - WWW ’10, 811
Based on the implementation available at https://github.com/khesui/FPMC
"""
def __init__(self, n_factor=32, learn_rate=0.01, regular=0.001, n_epoch=15, n_neg=10):
"""
:param n_factor: (optional) the number of latent factors
:param learn_rate: (optional) the learning rate
:param regular: (optional) the L2 regularization coefficient
:param n_epoch: (optional) the number of training epochs
:param n_neg: (optional) the number of negative samples used in BPR learning
"""
super(FPMCRecommender, self).__init__()
self.n_epoch = n_epoch
self.n_neg = n_neg
self.n_factor = n_factor
self.learn_rate = learn_rate
self.regular = regular
def __str__(self):
return 'FPMCRecommender(n_epoch={n_epoch}, ' \
'n_neg={n_neg}, ' \
'n_factor={n_factor}, ' \
'learn_rate={learn_rate}, ' \
'regular={regular})'.format(**self.__dict__)
def fit(self, train_data):
self._declare(train_data)
train_data_supervised = []
for i, row in train_data.iterrows():
u = self.user_mapping[row['user_id']]
seq = []
if len(row['sequence']) > 1: # cannot use sequences with length 1 for supervised learning
for item in row['sequence']:
i = self.item_mapping[item]
seq.append(i)
train_data_supervised.append((u, seq[len(seq) - 1], seq[:len(seq) - 1]))
self.fpmc = FPMC(n_user=len(self.user_mapping), n_item=len(self.item_mapping),
n_factor=self.n_factor, learn_rate=self.learn_rate, regular=self.regular)
self.fpmc.user_set = set(self.user_mapping.values())
self.fpmc.item_set = set(self.item_mapping.values())
self.fpmc.init_model()
self.fpmc.learnSBPR_FPMC(train_data_supervised, n_epoch=self.n_epoch, neg_batch_size=self.n_neg)
def recommend(self, user_profile, user_id=None):
context = []
for item in user_profile:
context.append(self.item_mapping[item])
items, scores = self.fpmc.evaluation_recommender(self.user_mapping[user_id], context)
recommendations = []
for i, it in enumerate(items):
recommendations.append(([self.reverse_item_mapping[it]], scores[i]))
return recommendations
def _declare(self, data):
self.user_mapping = {}
self.item_mapping = {}
self.reverse_item_mapping = {}
user_counter = 0
item_counter = 0
for i, row in data.iterrows():
if row['user_id'] not in self.user_mapping:
self.user_mapping[row['user_id']] = user_counter
user_counter += 1
for item in row['sequence']:
if item not in self.item_mapping:
self.item_mapping[item] = item_counter
self.reverse_item_mapping[item_counter] = item
item_counter += 1
fpmcrecommender = FPMCRecommender(n_factor=16, n_epoch=5)
fpmcrecommender.fit(train_data)
Here we fit the recommendation algorithm over the sessions in the training set.
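A minimal usage sketch: unlike the previous models, FPMC is personalized, so recommend() also needs the user_id of the profile owner, and both the user and the items must appear in the training data (here we simply reuse a training session for illustration).
# Illustrative only: personalized recommendations for one training user
sample_row = train_data.iloc[0]
recs = fpmcrecommender.recommend(sample_row['sequence'], user_id=sample_row['user_id'])
print(recs[:3])  # ([item], score) tuples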
This is a simplified implementation of the following:
Grbovic, Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp. "E-commerce in your inbox: Product recommendations at scale." In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1809-1818. ACM, 2015.
This implementation uses the gensim
implementation of Word2Vec to compute item embeddings using the skip-gram model.
Recommendations are generated by returning the k-nearest neighbors of the last items in the user profile, whose relevance is weighted using a simple exponential decay (the last item in the user profile is the most relevant one, and the first item the least relevant).
The original paper contains other variants of this algorithm (namely bagged-prod2vec and prod2vec-cluster) which are not covered in this tutorial.
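For example (illustrative numbers only), with decay_alpha = 0.9 and a profile of three items, the similarities of the neighbors of the last item are weighted by 0.9^0 = 1.0, those of the second-to-last item by 0.9^1 = 0.9, and those of the first item by 0.9^2 = 0.81, before all candidates are merged and sorted by the decayed score.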
The class Prod2VecRecommender has the following initialization hyper-parameters:
- min_count: the minimum item frequency. Items less frequent than min_count will be pruned
- size: the size of the embeddings
- window: the size of the context window
- decay_alpha: the exponential decay factor used to discount the similarity scores for items further back in the user profile. Lower values mean stronger discounting of past user interactions. Accepted values are in [0-1]
- workers: the number of threads used for training
class Prod2VecRecommender(ISeqRecommender):
"""
Implementation of the Prod2Vec skipgram model from
Grbovic Mihajlo, Vladan Radosavljevic, Nemanja Djuric, Narayan Bhamidipati, Jaikit Savla, Varun Bhagwan, and Doug Sharp.
"E-commerce in your inbox: Product recommendations at scale."
In Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining,
pp. 1809-1818. ACM, 2015.
"""
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def __init__(self, min_count=2, size=100, window=5, decay_alpha=0.9, workers=4):
"""
        :param min_count: (optional) the minimum item frequency. Items less frequent than min_count will be pruned
:param size: (optional) the size of the embeddings
:param window: (optional) the size of the context window
:param decay_alpha: (optional) the exponential decay factor used to discount the similarity scores for items
back in the user profile. Lower values mean higher discounting of past user interactions. Allows values in [0-1].
:param workers: (optional) the number of threads used for training
"""
super(Prod2VecRecommender, self).__init__()
self.min_count = min_count
self.size = size
self.window = window
self.decay_alpha = decay_alpha
self.workers = workers
def __str__(self):
return 'Prod2VecRecommender(min_count={min_count}, ' \
'size={size}, ' \
'window={window}, ' \
'decay_alpha={decay_alpha}, ' \
'workers={workers})'.format(**self.__dict__)
def fit(self, train_data):
sequences = train_data['sequence'].values
self.model = gensim.models.Word2Vec(sequences,
min_count=self.min_count,
window=self.window,
hs=1,
size=self.size,
sg=1,
workers=self.workers)
def recommend(self, user_profile, user_id=None):
user_profile = list(map(str, user_profile))
rec = []
try:
# iterate the user profile backwards
for i, item in enumerate(user_profile[::-1]):
ms = self.model.most_similar(positive=item)
# apply exponential decay to the similarity scores
decay = self.decay_alpha ** i
ms = [(x[0], decay * x[1]) for x in ms]
rec.extend(ms)
# sort items by similarity score
rec = sorted(rec, key=lambda x: -x[1])
except KeyError:
rec = []
return [([x[0]], x[1]) for x in rec]
p2vrecommender = Prod2VecRecommender(min_count=2,
size=50,
window=5,
decay_alpha=0.9,
workers=4)
p2vrecommender.fit(train_data)
Here we fit the recommendation algorithm over the sessions in the training set.
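A minimal usage sketch (assuming the split computed earlier); note that recommend() casts the profile items to strings, since gensim treats item IDs as word tokens.
# Illustrative only: embedding-based nearest-neighbour recommendations for one test session
sample_profile = test_data['sequence'].values[0]
print(p2vrecommender.recommend(sample_profile)[:5])  # ([item], decayed_similarity) tuples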
This is a simplified interface to Recurrent Neural Network models for session-based recommendation, based on the following two papers:
- Recurrent Neural Networks with Top-k Gains for Session-based Recommendations, Hidasi and Karatzoglou, CIKM 2018
- Personalizing Session-based Recommendation with Hierarchical Recurrent Neural Networks, Quadrana et al, Recsys 2017
In this notebook, we will consider the session-based (non-personalized) version of the algorithm. Here's a schematic representation of the model:
Each item in the current user session is first encoded either using a 1-hot encoding or a dense embedding vector. The item representation is then forwarded to one or more Gated Recurrent Unit (GRU) layers, which "mix" the information coming from the past steps of the sequence with the representation of the current item. The last hidden state of the network is finally used to compute the likelihood scores for the next items using one of several loss functions (e.g. cross-entropy, BPR, TOP1, BPR-max, TOP1-max, etc.).
For simplicity, we only support 1-hot encoded inputs and the BPR-max loss function here.
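To make the loss concrete, here is a small numpy sketch of the BPR-max loss for a single prediction step (illustrative only; the actual Theano implementation lives in the GRU4Rec class below, and bpreg is its score-regularization coefficient):
import numpy as np

def bpr_max_loss(r_pos, r_neg, bpreg=1.0):
    # softmax weights over the negative-item scores
    s = np.exp(r_neg - r_neg.max())
    s /= s.sum()
    # probability that the positive item outscores each negative item
    sig = 1.0 / (1.0 + np.exp(-(r_pos - r_neg)))
    # weighted ranking term + score regularization on the negative scores
    return -np.log(np.sum(s * sig) + 1e-24) + bpreg * np.sum(s * r_neg ** 2)

print(bpr_max_loss(r_pos=2.0, r_neg=np.array([0.5, -1.0, 1.5])))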
The hyper-parameters of the model are:
- session_layers: the number of units per layer used at the session level. It has to be a list of integers for multi-layer networks, or a single integer for single-layer networks.
- user_layers: the number of units per layer used at the user level. Required only by personalized models (None in this case).
- batch_size: the mini-batch size used in training.
- learning_rate: the learning rate used in training (Adagrad optimizer).
- momentum: the momentum coefficient used in training.
- dropout: the dropout probability for the hidden layer(s), as a float value.
- epochs: the number of training epochs.
- personalized: whether to train a personalized model using the HRNN model (False in this case).
NOTE: GRU4Rec originally has many more hyper-parameters. Going through all of them is out of the scope of this tutorial, but we suggest checking out the original source code here if you are interested.
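As a rough configuration sketch for the setup described above (1-hot input, BPR-max loss), the GRU4Rec class reported below could be instantiated like this; the session/item/time column names are assumptions and must match the training DataFrame you pass to fit().
# Illustrative only: a session-based GRU4Rec configuration (column names are assumptions)
gru = GRU4Rec(loss='bpr-max', final_act='linear', hidden_act='tanh', layers=[100],
              n_epochs=10, batch_size=32, learning_rate=0.1, momentum=0.1,
              dropout_p_hidden=0.1, embedding=0,
              session_key='session_id', item_key='item_id', time_key='ts')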
class GRU4Rec:
'''
GRU4Rec(loss='bpr-max', final_act='elu-1', hidden_act='tanh', layers=[100],
n_epochs=10, batch_size=32, dropout_p_hidden=0.0, dropout_p_embed=0.0, learning_rate=0.1, momentum=0.0, lmbd=0.0, embedding=0, n_sample=2048, sample_alpha=0.75, smoothing=0.0, constrained_embedding=False,
adapt='adagrad', adapt_params=[], grad_cap=0.0, bpreg=1.0,
sigma=0.0, init_as_normal=False, train_random_order=False, time_sort=True,
session_key='SessionId', item_key='ItemId', time_key='Time')
Initializes the network.
Parameters
-----------
loss : 'top1', 'bpr', 'cross-entropy', 'xe_logit', 'top1-max', 'bpr-max'
selects the loss function (default : 'bpr-max')
final_act : 'softmax', 'linear', 'relu', 'tanh', 'softmax_logit', 'leaky-<X>', 'elu-<X>', 'selu-<X>-<Y>'
selects the activation function of the final layer, <X> and <Y> are the parameters of the activation function (default : 'elu-1')
hidden_act : 'linear', 'relu', 'tanh', 'leaky-<X>', 'elu-<X>', 'selu-<X>-<Y>'
selects the activation function on the hidden states, <X> and <Y> are the parameters of the activation function (default : 'tanh')
layers : list of int values
list of the number of GRU units in the layers (default : [100])
n_epochs : int
number of training epochs (default: 10)
batch_size : int
        size of the minibatch; also affects the number of negative samples through minibatch-based sampling (default: 32)
dropout_p_hidden : float
probability of dropout of hidden units (default: 0.0)
dropout_p_embed : float
probability of dropout of the input units, applicable only if embeddings are used (default: 0.0)
learning_rate : float
        learning rate (default: 0.1)
momentum : float
if not zero, Nesterov momentum will be applied during training with the given strength (default: 0.0)
lmbd : float
coefficient of the L2 regularization (default: 0.0)
embedding : int
size of the embedding used, 0 means not to use embedding (default: 0)
n_sample : int
number of additional negative samples to be used (besides the other examples of the minibatch) (default: 2048)
sample_alpha : float
the probability of an item used as an additional negative sample is supp^sample_alpha (default: 0.75)
(e.g.: sample_alpha=1 --> popularity based sampling; sample_alpha=0 --> uniform sampling)
smoothing : float
        (only works with cross-entropy and xe_logit losses) if set to non-zero, class labels are smoothed with this value, i.e. the expected output is (e/N, ..., e/N, 1-e+e/N, e/N, ..., e/N) instead of (0, ..., 0, 1, 0, ..., 0), where N is the number of outputs and e is the smoothing value (default: 0.0)
constrained_embedding : bool
if True, the output weight matrix is also used as input embedding (default: False)
adapt : None, 'adagrad', 'rmsprop', 'adam', 'adadelta'
sets the appropriate learning rate adaptation strategy, use None for standard SGD (default: 'adagrad')
adapt_params : list
parameters for the adaptive learning methods (default: [])
grad_cap : float
        clip gradients that exceed this value to this value; 0 means no clipping (default: 0.0)
bpreg : float
score regularization coefficient for the BPR-max loss function (default: 1.0)
sigma : float
"width" of initialization; either the standard deviation or the min/max of the init interval (with normal and uniform initializations respectively); 0 means adaptive normalization (sigma depends on the size of the weight matrix); (default: 0.0)
init_as_normal : boolean
False: init from uniform distribution on [-sigma,sigma]; True: init from normal distribution N(0,sigma); (default: False)
train_random_order : boolean
whether to randomize the order of sessions in each epoch (default: False)
time_sort : boolean
        whether to ensure that the order of sessions is chronological (default: True)
session_key : string
header of the session ID column in the input file (default: 'SessionId')
item_key : string
header of the item ID column in the input file (default: 'ItemId')
time_key : string
header of the timestamp column in the input file (default: 'Time')
'''
def __init__(self, loss='bpr-max', final_act='linear', hidden_act='tanh', layers=[100],
n_epochs=10, batch_size=32, dropout_p_hidden=0.0, dropout_p_embed=0.0, learning_rate=0.1, momentum=0.0,
lmbd=0.0, embedding=0, n_sample=2048, sample_alpha=0.75, smoothing=0.0, constrained_embedding=False,
adapt='adagrad', adapt_params=[], grad_cap=0.0, bpreg=1.0,
sigma=0.0, init_as_normal=False, train_random_order=False, time_sort=True,
session_key='SessionId', item_key='ItemId', time_key='Time'):
self.layers = layers
self.n_epochs = n_epochs
self.batch_size = batch_size
self.dropout_p_hidden = dropout_p_hidden
self.dropout_p_embed = dropout_p_embed
self.learning_rate = learning_rate
self.adapt_params = adapt_params
self.momentum = momentum
self.sigma = sigma
self.init_as_normal = init_as_normal
self.session_key = session_key
self.item_key = item_key
self.time_key = time_key
self.grad_cap = grad_cap
self.bpreg = bpreg
self.train_random_order = train_random_order
self.lmbd = lmbd
self.embedding = embedding
self.constrained_embedding = constrained_embedding
self.time_sort = time_sort
self.adapt = adapt
self.loss = loss
self.set_loss_function(self.loss)
self.final_act = final_act
self.set_final_activation(self.final_act)
self.hidden_act = hidden_act
self.set_hidden_activation(self.hidden_act)
self.n_sample = n_sample
self.sample_alpha = sample_alpha
self.smoothing = smoothing
def set_loss_function(self, loss):
if loss == 'cross-entropy':
self.loss_function = self.cross_entropy
elif loss == 'bpr':
self.loss_function = self.bpr
elif loss == 'bpr-max':
self.loss_function = self.bpr_max
elif loss == 'top1':
self.loss_function = self.top1
elif loss == 'top1-max':
self.loss_function = self.top1_max
elif loss == 'xe_logit':
self.loss_function = self.cross_entropy_logits
else:
raise NotImplementedError
def set_final_activation(self, final_act):
if final_act == 'linear':
self.final_activation = self.linear
elif final_act == 'relu':
self.final_activation = self.relu
elif final_act == 'softmax':
self.final_activation = self.softmax
elif final_act == 'tanh':
self.final_activation = self.tanh
elif final_act == 'softmax_logit':
self.final_activation = self.softmax_logit
elif final_act.startswith('leaky-'):
self.final_activation = self.LeakyReLU(float(final_act.split('-')[1])).execute
elif final_act.startswith('elu-'):
self.final_activation = self.Elu(float(final_act.split('-')[1])).execute
elif final_act.startswith('selu-'):
self.final_activation = self.Selu(*[float(x) for x in final_act.split('-')[1:]]).execute
else:
raise NotImplementedError
def set_hidden_activation(self, hidden_act):
if hidden_act == 'relu':
self.hidden_activation = self.relu
elif hidden_act == 'tanh':
self.hidden_activation = self.tanh
elif hidden_act == 'linear':
self.hidden_activation = self.linear
elif hidden_act.startswith('leaky-'):
self.hidden_activation = self.LeakyReLU(float(hidden_act.split('-')[1])).execute
elif hidden_act.startswith('elu-'):
self.hidden_activation = self.Elu(float(hidden_act.split('-')[1])).execute
elif hidden_act.startswith('selu-'):
self.hidden_activation = self.Selu(*[float(x) for x in hidden_act.split('-')[1:]]).execute
else:
raise NotImplementedError
def set_params(self, **kvargs):
maxk_len = np.max([len(x) for x in kvargs.keys()])
maxv_len = np.max([len(x) for x in kvargs.values()])
for k, v in kvargs.items():
if not hasattr(self, k):
print('Unkown attribute: {}'.format(k))
raise NotImplementedError
else:
if k == 'adapt_params':
v = [float(l) for l in v.split('/')]
elif type(getattr(self, k)) == list:
v = [int(l) for l in v.split('/')]
if type(getattr(self, k)) == bool:
if v == 'True' or v == '1':
v = True
elif v == 'False' or v == '0':
v = False
else:
print('Invalid value for boolean parameter: {}'.format(v))
raise NotImplementedError
setattr(self, k, type(getattr(self, k))(v))
if k == 'loss': self.set_loss_function(self.loss)
if k == 'final_act': self.set_final_activation(self.final_act)
if k == 'hidden_act': self.set_hidden_activation(self.hidden_act)
print('SET {}{}TO {}{}(type: {})'.format(k, ' ' * (maxk_len - len(k) + 3), getattr(self, k),
' ' * (maxv_len - len(str(getattr(self, k))) + 3),
type(getattr(self, k))))
######################ACTIVATION FUNCTIONS#####################
def linear(self, X):
return X
def tanh(self, X):
return T.tanh(X)
def softmax(self, X):
e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
def softmax_logit(self, X):
X = X - X.max(axis=1).dimshuffle(0, 'x')
return T.log(T.exp(X).sum(axis=1).dimshuffle(0, 'x')) - X
def softmax_neg(self, X):
hm = 1.0 - T.eye(*X.shape)
X = X * hm
e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x')) * hm
return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
def relu(self, X):
return T.maximum(X, 0)
def sigmoid(self, X):
return T.nnet.sigmoid(X)
class Selu:
def __init__(self, lmbd, alpha):
self.lmbd = lmbd
self.alpha = alpha
def execute(self, X):
return self.lmbd * T.switch(T.ge(X, 0), X, self.alpha * (T.exp(X) - 1))
class Elu:
def __init__(self, alpha):
self.alpha = alpha
def execute(self, X):
return T.switch(T.ge(X, 0), X, self.alpha * (T.exp(X) - 1))
class LeakyReLU:
def __init__(self, leak):
self.leak = leak
def execute(self, X):
return T.switch(T.ge(X, 0), X, self.leak * X)
#################################LOSS FUNCTIONS################################
def cross_entropy(self, yhat, M):
if self.smoothing:
n_out = M + self.n_sample
return T.cast(T.mean(
(1.0 - (n_out / (n_out - 1)) * self.smoothing) * (-T.log(gpu_diag_wide(yhat) + 1e-24)) + (
self.smoothing / (n_out - 1)) * T.sum(-T.log(yhat + 1e-24), axis=1)), theano.config.floatX)
else:
return T.cast(T.mean(-T.log(gpu_diag_wide(yhat) + 1e-24)), theano.config.floatX)
def cross_entropy_logits(self, yhat, M):
if self.smoothing:
n_out = M + self.n_sample
return T.cast(T.mean((1.0 - (n_out / (n_out - 1)) * self.smoothing) * gpu_diag_wide(yhat) + (
self.smoothing / (n_out - 1)) * T.sum(yhat, axis=1)), theano.config.floatX)
else:
return T.cast(T.mean(gpu_diag_wide(yhat)), theano.config.floatX)
def bpr(self, yhat, M):
return T.cast(T.mean(-T.log(T.nnet.sigmoid(gpu_diag_wide(yhat).dimshuffle((0, 'x')) - yhat))),
theano.config.floatX)
def bpr_max(self, yhat, M):
softmax_scores = self.softmax_neg(yhat)
return T.cast(T.mean(-T.log(
T.sum(T.nnet.sigmoid(gpu_diag_wide(yhat).dimshuffle((0, 'x')) - yhat) * softmax_scores,
axis=1) + 1e-24) + self.bpreg * T.sum((yhat ** 2) * softmax_scores, axis=1)), theano.config.floatX)
def top1(self, yhat, M):
ydiag = gpu_diag_wide(yhat).dimshuffle((0, 'x'))
return T.cast(T.mean(
T.mean(T.nnet.sigmoid(-ydiag + yhat) + T.nnet.sigmoid(yhat ** 2), axis=1) - T.nnet.sigmoid(ydiag ** 2) / (
M + self.n_sample)), theano.config.floatX)
def top1_max(self, yhat, M):
softmax_scores = self.softmax_neg(yhat)
y = softmax_scores * (
T.nnet.sigmoid(-gpu_diag_wide(yhat).dimshuffle((0, 'x')) + yhat) + T.nnet.sigmoid(yhat ** 2))
return T.cast(T.mean(T.sum(y, axis=1)), theano.config.floatX)
###############################################################################
def floatX(self, X):
return np.asarray(X, dtype=theano.config.floatX)
def init_weights(self, shape, name=None):
return theano.shared(self.init_matrix(shape), borrow=True, name=name)
def init_matrix(self, shape):
if self.sigma != 0:
sigma = self.sigma
else:
sigma = np.sqrt(6.0 / (shape[0] + shape[1]))
if self.init_as_normal:
return self.floatX(np.random.randn(*shape) * sigma)
else:
return self.floatX(np.random.rand(*shape) * sigma * 2 - sigma)
def extend_weights(self, W, n_new):
matrix = W.get_value()
sigma = self.sigma if self.sigma != 0 else np.sqrt(6.0 / (matrix.shape[0] + matrix.shape[1] + n_new))
if self.init_as_normal:
new_rows = self.floatX(np.random.randn(n_new, matrix.shape[1]) * sigma)
else:
new_rows = self.floatX(np.random.rand(n_new, matrix.shape[1]) * sigma * 2 - sigma)
W.set_value(np.vstack([matrix, new_rows]))
def init(self, data):
data.sort_values([self.session_key, self.time_key], inplace=True)
offset_sessions = np.zeros(data[self.session_key].nunique() + 1, dtype=np.int32)
offset_sessions[1:] = data.groupby(self.session_key).size().cumsum()
np.random.seed(42)
self.Wx, self.Wh, self.Wrz, self.Bh, self.H = [], [], [], [], []
if self.constrained_embedding:
n_features = self.layers[-1]
elif self.embedding:
self.E = self.init_weights((self.n_items, self.embedding), name='E')
n_features = self.embedding
else:
n_features = self.n_items
for i in range(len(self.layers)):
m = []
m.append(self.init_matrix((self.layers[i - 1] if i > 0 else n_features, self.layers[i])))
m.append(self.init_matrix((self.layers[i - 1] if i > 0 else n_features, self.layers[i])))
m.append(self.init_matrix((self.layers[i - 1] if i > 0 else n_features, self.layers[i])))
self.Wx.append(
theano.shared(value=np.hstack(m), borrow=True, name='Wx{}'.format(i))) # For compatibility's sake
self.Wh.append(self.init_weights((self.layers[i], self.layers[i]), name='Wh{}'.format(i)))
m2 = []
m2.append(self.init_matrix((self.layers[i], self.layers[i])))
m2.append(self.init_matrix((self.layers[i], self.layers[i])))
self.Wrz.append(
theano.shared(value=np.hstack(m2), borrow=True, name='Wrz{}'.format(i))) # For compatibility's sake
self.Bh.append(theano.shared(value=np.zeros((self.layers[i] * 3,), dtype=theano.config.floatX), borrow=True,
name='Bh{}'.format(i)))
self.H.append(theano.shared(value=np.zeros((self.batch_size, self.layers[i]), dtype=theano.config.floatX),
borrow=True, name='H{}'.format(i)))
self.Wy = self.init_weights((self.n_items, self.layers[-1]), name='Wy')
self.By = theano.shared(value=np.zeros((self.n_items, 1), dtype=theano.config.floatX), borrow=True, name='By')
return offset_sessions
def dropout(self, X, drop_p):
if drop_p > 0:
retain_prob = 1 - drop_p
X *= mrng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) / retain_prob
return X
def adam(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
v1 = np.float32(self.adapt_params[0])
v2 = np.float32(1.0 - self.adapt_params[0])
v3 = np.float32(self.adapt_params[1])
v4 = np.float32(1.0 - self.adapt_params[1])
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
meang = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
countt = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = v3 * acc + v4 * (grad ** 2)
meang_new = v1 * meang + v2 * grad
countt_new = countt + 1
updates[acc] = acc_new
updates[meang] = meang_new
updates[countt] = countt_new
else:
acc_s = acc[sample_idx]
meang_s = meang[sample_idx]
countt_s = countt[sample_idx]
# acc_new = v3 * acc_s + v4 * (grad**2) #Faster, but inaccurate when an index occurs multiple times
# updates[acc] = T.set_subtensor(acc_s, acc_new) #Faster, but inaccurate when an index occurs multiple times
updates[acc] = T.inc_subtensor(T.set_subtensor(acc_s, acc_s * v3)[sample_idx],
v4 * (grad ** 2)) # Slower, but accurate when an index occurs multiple times
acc_new = updates[acc][sample_idx] # Slower, but accurate when an index occurs multiple times
# meang_new = v1 * meang_s + v2 * grad
# updates[meang] = T.set_subtensor(meang_s, meang_new) #Faster, but inaccurate when an index occurs multiple times
updates[meang] = T.inc_subtensor(T.set_subtensor(meang_s, meang_s * v1)[sample_idx], v2 * (
grad ** 2)) # Slower, but accurate when an index occurs multiple times
meang_new = updates[meang][sample_idx] # Slower, but accurate when an index occurs multiple times
countt_new = countt_s + 1.0
updates[countt] = T.set_subtensor(countt_s, countt_new)
return (meang_new / (1 - v1 ** countt_new)) / (T.sqrt(acc_new / (1 - v1 ** countt_new)) + epsilon)
def adagrad(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = acc + grad ** 2
updates[acc] = acc_new
else:
acc_s = acc[sample_idx]
acc_new = acc_s + grad ** 2
updates[acc] = T.set_subtensor(acc_s, acc_new)
gradient_scaling = T.cast(T.sqrt(acc_new + epsilon), theano.config.floatX)
return grad / gradient_scaling
def adadelta(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
v1 = np.float32(self.adapt_params[0])
v2 = np.float32(1.0 - self.adapt_params[0])
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
upd = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = v1 * acc + v2 * (grad ** 2)
updates[acc] = acc_new
grad_scaling = (upd + epsilon) / (acc_new + epsilon)
upd_new = v1 * upd + v2 * grad_scaling * (grad ** 2)
updates[upd] = upd_new
else:
acc_s = acc[sample_idx]
# acc_new = v1 * acc_s + v2 * (grad**2) #Faster, but inaccurate when an index occurs multiple times
# updates[acc] = T.set_subtensor(acc_s, acc_new) #Faster, but inaccurate when an index occurs multiple times
updates[acc] = T.inc_subtensor(T.set_subtensor(acc_s, acc_s * v1)[sample_idx],
v2 * (grad ** 2)) # Slower, but accurate when an index occurs multiple times
acc_new = updates[acc][sample_idx] # Slower, but accurate when an index occurs multiple times
upd_s = upd[sample_idx]
grad_scaling = (upd_s + epsilon) / (acc_new + epsilon)
# updates[upd] = T.set_subtensor(upd_s, v1 * upd_s + v2 * grad_scaling * (grad**2)) #Faster, but inaccurate when an index occurs multiple times
updates[upd] = T.inc_subtensor(T.set_subtensor(upd_s, upd_s * v1)[sample_idx], v2 * grad_scaling * (
grad ** 2)) # Slower, but accurate when an index occurs multiple times
gradient_scaling = T.cast(T.sqrt(grad_scaling), theano.config.floatX)
if self.learning_rate != 1.0:
print('Warn: learning_rate is not 1.0 while using adadelta. Setting learning_rate to 1.0')
self.learning_rate = 1.0
return grad * gradient_scaling # Ok, checked
def rmsprop(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
v1 = np.float32(self.adapt_params[0])
v2 = np.float32(1.0 - self.adapt_params[0])
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = v1 * acc + v2 * grad ** 2
updates[acc] = acc_new
else:
acc_s = acc[sample_idx]
# acc_new = v1 * acc_s + v2 * grad ** 2 #Faster, but inaccurate when an index occurs multiple times
# updates[acc] = T.set_subtensor(acc_s, acc_new) #Faster, but inaccurate when an index occurs multiple times
updates[acc] = T.inc_subtensor(T.set_subtensor(acc_s, acc_s * v1)[sample_idx],
v2 * grad ** 2) # Slower, but accurate when an index occurs multiple times
acc_new = updates[acc][sample_idx] # Slower, but accurate when an index occurs multiple times
gradient_scaling = T.cast(T.sqrt(acc_new + epsilon), theano.config.floatX)
return grad / gradient_scaling
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
grads = [T.grad(cost=cost, wrt=param) for param in params]
sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
updates = OrderedDict()
if self.grad_cap > 0:
norm = T.cast(T.sqrt(T.sum([T.sum([T.sum(g ** 2) for g in g_list]) for g_list in grads]) + T.sum(
[T.sum(g ** 2) for g in sgrads])), theano.config.floatX)
grads = [[T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in g_list] for g_list in
grads]
sgrads = [T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in sgrads]
for p_list, g_list in zip(params, grads):
for p, g in zip(p_list, g_list):
if self.adapt == 'adagrad':
g = self.adagrad(p, g, updates)
elif self.adapt == 'rmsprop':
g = self.rmsprop(p, g, updates)
elif self.adapt == 'adadelta':
g = self.adadelta(p, g, updates)
elif self.adapt == 'adam':
g = self.adam(p, g, updates)
if self.momentum > 0:
velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
updates[velocity] = velocity2
updates[p] = p + velocity2
else:
updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(
self.learning_rate) * g
for i in range(len(sgrads)):
g = sgrads[i]
fullP = full_params[i]
sample_idx = sidxs[i]
sparam = sampled_params[i]
if self.adapt == 'adagrad':
g = self.adagrad(fullP, g, updates, sample_idx)
elif self.adapt == 'rmsprop':
g = self.rmsprop(fullP, g, updates, sample_idx)
elif self.adapt == 'adadelta':
g = self.adadelta(fullP, g, updates, sample_idx)
elif self.adapt == 'adam':
g = self.adam(fullP, g, updates, sample_idx)
if self.lmbd > 0:
delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
else:
delta = np.float32(self.learning_rate) * g
if self.momentum > 0:
velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
vs = velocity[sample_idx]
velocity2 = self.momentum * vs - delta
updates[velocity] = T.set_subtensor(vs, velocity2)
updates[fullP] = T.inc_subtensor(sparam, velocity2)
else:
updates[fullP] = T.inc_subtensor(sparam, - delta)
return updates
def model(self, X, H, M, R=None, Y=None, drop_p_hidden=0.0, drop_p_embed=0.0, predict=False):
sparams, full_params, sidxs = [], [], []
if self.constrained_embedding:
if Y is not None: X = T.concatenate([X, Y], axis=0)
S = self.Wy[X]
Sx = S[:M]
Sy = S[M:]
y = self.dropout(Sx, drop_p_embed)
H_new = []
start = 0
sparams.append(S)
full_params.append(self.Wy)
sidxs.append(X)
elif self.embedding:
Sx = self.E[X]
y = self.dropout(Sx, drop_p_embed)
H_new = []
start = 0
sparams.append(Sx)
full_params.append(self.E)
sidxs.append(X)
else:
Sx = self.Wx[0][X]
vec = Sx + self.Bh[0]
rz = T.nnet.sigmoid(vec[:, self.layers[0]:] + T.dot(H[0], self.Wrz[0]))
h = self.hidden_activation(T.dot(H[0] * rz[:, :self.layers[0]], self.Wh[0]) + vec[:, :self.layers[0]])
z = rz[:, self.layers[0]:]
h = (1.0 - z) * H[0] + z * h
h = self.dropout(h, drop_p_hidden)
y = h
H_new = [T.switch(R.dimshuffle((0, 'x')), 0, h) if not predict else h]
start = 1
sparams.append(Sx)
full_params.append(self.Wx[0])
sidxs.append(X)
for i in range(start, len(self.layers)):
vec = T.dot(y, self.Wx[i]) + self.Bh[i]
rz = T.nnet.sigmoid(vec[:, self.layers[i]:] + T.dot(H[i], self.Wrz[i]))
h = self.hidden_activation(T.dot(H[i] * rz[:, :self.layers[i]], self.Wh[i]) + vec[:, :self.layers[i]])
z = rz[:, self.layers[i]:]
h = (1.0 - z) * H[i] + z * h
h = self.dropout(h, drop_p_hidden)
y = h
H_new.append(T.switch(R.dimshuffle((0, 'x')), 0, h) if not predict else h)
if Y is not None:
if (not self.constrained_embedding) or predict:
Sy = self.Wy[Y]
sparams.append(Sy)
full_params.append(self.Wy)
sidxs.append(Y)
SBy = self.By[Y]
sparams.append(SBy)
full_params.append(self.By)
sidxs.append(Y)
if predict and self.final_act == 'softmax_logit':
y = self.softmax(T.dot(y, Sy.T) + SBy.flatten())
else:
y = self.final_activation(T.dot(y, Sy.T) + SBy.flatten())
return H_new, y, sparams, full_params, sidxs
else:
if predict and self.final_act == 'softmax_logit':
y = self.softmax(T.dot(y, self.Wy.T) + self.By.flatten())
else:
y = self.final_activation(T.dot(y, self.Wy.T) + self.By.flatten())
return H_new, y, sparams, full_params, sidxs
def generate_neg_samples(self, pop, length):
if self.sample_alpha:
sample = np.searchsorted(pop, np.random.rand(self.n_sample * length))
else:
sample = np.random.choice(self.n_items, size=self.n_sample * length)
if length > 1:
sample = sample.reshape((length, self.n_sample))
return sample
def fit(self, data, sample_store=10000000):
'''
Trains the network.
Parameters
--------
data : pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
sample_store : int
            If additional negative samples are used (n_sample > 0), the efficiency of GPU utilization can be sped up by precomputing a large batch of negative samples (and recomputing when necessary).
            This parameter regulates the size of this precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are stored in the RAM.
For the most efficient computation, a balance must be found between storing few examples and constantly interrupting GPU computations for a short time vs. computing many examples and interrupting GPU computations for a long time (but rarely).
'''
self.predict = None
self.error_during_train = False
itemids = data[self.item_key].unique()
self.n_items = len(itemids)
self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids)
data = pd.merge(data, pd.DataFrame({self.item_key: itemids, 'ItemIdx': self.itemidmap[itemids].values}),
on=self.item_key, how='inner')
offset_sessions = self.init(data)
if self.n_sample:
pop = data.groupby(self.item_key).size()
pop = pop[self.itemidmap.index.values].values ** self.sample_alpha
pop = pop.cumsum() / pop.sum()
pop[-1] = 1
if sample_store:
generate_length = sample_store // self.n_sample
if generate_length <= 1:
sample_store = 0
print('No example store was used')
else:
neg_samples = self.generate_neg_samples(pop, generate_length)
sample_pointer = 0
else:
print('No example store was used')
X = T.ivector()
Y = T.ivector()
M = T.iscalar()
R = T.bvector()
H_new, Y_pred, sparams, full_params, sidxs = self.model(X, self.H, M, R, Y, self.dropout_p_hidden,
self.dropout_p_embed)
cost = (M / self.batch_size) * self.loss_function(Y_pred, M)
params = [self.Wx if self.embedding or self.constrained_embedding else self.Wx[1:], self.Wh, self.Wrz, self.Bh]
updates = self.RMSprop(cost, params, full_params, sparams, sidxs)
for i in range(len(self.H)):
updates[self.H[i]] = H_new[i]
train_function = function(inputs=[X, Y, M, R], outputs=cost, updates=updates, allow_input_downcast=True)
base_order = np.argsort(
data.groupby(self.session_key)[self.time_key].min().values) if self.time_sort else np.arange(
len(offset_sessions) - 1)
data_items = data.ItemIdx.values
for epoch in range(self.n_epochs):
for i in range(len(self.layers)):
self.H[i].set_value(np.zeros((self.batch_size, self.layers[i]), dtype=theano.config.floatX),
borrow=True)
c = []
cc = []
session_idx_arr = np.random.permutation(len(offset_sessions) - 1) if self.train_random_order else base_order
iters = np.arange(self.batch_size)
maxiter = iters.max()
start = offset_sessions[session_idx_arr[iters]]
end = offset_sessions[session_idx_arr[iters] + 1]
finished = False
while not finished:
minlen = (end - start).min()
out_idx = data_items[start]
for i in range(minlen - 1):
in_idx = out_idx
out_idx = data_items[start + i + 1]
if self.n_sample:
if sample_store:
if sample_pointer == generate_length:
neg_samples = self.generate_neg_samples(pop, generate_length)
sample_pointer = 0
sample = neg_samples[sample_pointer]
sample_pointer += 1
else:
sample = self.generate_neg_samples(pop, 1)
y = np.hstack([out_idx, sample])
else:
y = out_idx
if self.n_sample:
if sample_pointer == generate_length:
generate_samples()
sample_pointer = 0
sample_pointer += 1
reset = (start + i + 1 == end - 1)
cost = train_function(in_idx, y, len(iters), reset)
c.append(cost)
cc.append(len(iters))
if np.isnan(cost):
print(str(epoch) + ': NaN error!')
self.error_during_train = True
return
start = start + minlen - 1
finished_mask = (end - start <= 1)
n_finished = finished_mask.sum()
iters[finished_mask] = maxiter + np.arange(1, n_finished + 1)
maxiter += n_finished
valid_mask = (iters < len(offset_sessions) - 1)
n_valid = valid_mask.sum()
if (n_valid == 0) or (n_valid < 2 and self.n_sample == 0):
finished = True
break
mask = finished_mask & valid_mask
sessions = session_idx_arr[iters[mask]]
start[mask] = offset_sessions[sessions]
end[mask] = offset_sessions[sessions + 1]
iters = iters[valid_mask]
start = start[valid_mask]
end = end[valid_mask]
if n_valid < len(valid_mask):
for i in range(len(self.H)):
tmp = self.H[i].get_value(borrow=True)
tmp = tmp[valid_mask]
self.H[i].set_value(tmp, borrow=True)
c = np.array(c)
cc = np.array(cc)
avgc = np.sum(c * cc) / np.sum(cc)
if np.isnan(avgc):
print('Epoch {}: NaN error!'.format(str(epoch)))
self.error_during_train = True
return
print('Epoch{}\tloss: {:.6f}'.format(epoch, avgc))
def predict_next_batch(self, session_ids, input_item_ids, predict_for_item_ids=None, batch=100):
'''
        Gives prediction scores for a selected set of items. Can be used in batch mode to predict for multiple independent events (i.e. events of different sessions) at once and thus speed up evaluation.
If the session ID at a given coordinate of the session_ids parameter remains the same during subsequent calls of the function, the corresponding hidden state of the network will be kept intact (i.e. that's how one can predict an item to a session).
If it changes, the hidden state of the network is reset to zeros.
Parameters
--------
session_ids : 1D array
            Contains the session IDs of the events of the batch. Its length must be equal to the prediction batch size (batch param).
input_item_ids : 1D array
            Contains the item IDs of the events of the batch. Every item ID must be in the training data of the network. Its length must be equal to the prediction batch size (batch param).
predict_for_item_ids : 1D array (optional)
IDs of items for which the network should give prediction scores. Every ID must be in the training set. The default value is None, which means that the network gives prediction on its every output (i.e. for all items in the training set).
batch : int
Prediction batch size.
Returns
--------
out : pandas.DataFrame
Prediction scores for selected items for every event of the batch.
Columns: events of the batch; rows: items. Rows are indexed by the item IDs.
'''
if self.error_during_train: raise Exception
if self.predict is None or self.predict_batch != batch:
self.predict_batch = batch
X = T.ivector()
Y = T.ivector()
M = T.iscalar() if self.constrained_embedding or (predict_for_item_ids is not None) else None
for i in range(len(self.layers)):
self.H[i].set_value(np.zeros((batch, self.layers[i]), dtype=theano.config.floatX), borrow=True)
if predict_for_item_ids is not None:
H_new, yhat, _, _, _ = self.model(X, self.H, M, Y=Y, predict=True)
else:
H_new, yhat, _, _, _ = self.model(X, self.H, M, predict=True)
updatesH = OrderedDict()
for i in range(len(self.H)):
updatesH[self.H[i]] = H_new[i]
if predict_for_item_ids is not None:
if self.constrained_embedding:
self.predict = function(inputs=[X, Y, M], outputs=yhat, updates=updatesH, allow_input_downcast=True)
else:
self.predict = function(inputs=[X, Y], outputs=yhat, updates=updatesH, allow_input_downcast=True)
else:
if self.constrained_embedding:
self.predict = function(inputs=[X, M], outputs=yhat, updates=updatesH, allow_input_downcast=True)
else:
self.predict = function(inputs=[X], outputs=yhat, updates=updatesH, allow_input_downcast=True)
self.current_session = np.ones(batch) * -1
session_change = np.arange(batch)[session_ids != self.current_session]
if len(session_change) > 0:
for i in range(len(self.H)):
tmp = self.H[i].get_value(borrow=True)
tmp[session_change] = 0
self.H[i].set_value(tmp, borrow=True)
self.current_session = session_ids.copy()
in_idxs = self.itemidmap[input_item_ids]
if np.any(np.isnan(in_idxs)):
preds = np.random.randn(len(self.itemidmap), len(in_idxs))
return pd.DataFrame(data=preds, index=self.itemidmap.index)
if predict_for_item_ids is not None:
iIdxs = self.itemidmap[predict_for_item_ids]
if self.constrained_embedding:
preds = np.asarray(self.predict(in_idxs, iIdxs, batch)).T
else:
preds = np.asarray(self.predict(in_idxs, iIdxs)).T
return pd.DataFrame(data=preds, index=predict_for_item_ids)
else:
if self.constrained_embedding:
preds = np.asarray(self.predict(in_idxs, batch)).T
else:
preds = np.asarray(self.predict(in_idxs)).T
return pd.DataFrame(data=preds, index=self.itemidmap.index)
def symbolic_predict(self, X, Y, M, items, batch_size):
if not self.constrained_embedding: M = None
H = []
for i in range(len(self.layers)):
H.append(theano.shared(np.zeros((batch_size, self.layers[i]), dtype=theano.config.floatX)))
if items is not None:
H_new, yhat, _, _, _ = self.model(X, H, M, Y=Y, predict=True)
else:
H_new, yhat, _, _, _ = self.model(X, H, M, predict=True)
updatesH = OrderedDict()
for i in range(len(H)):
updatesH[H[i]] = H_new[i]
return yhat, H, updatesH
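For reference, the docstring of predict_next_batch above describes how the hidden state is carried across calls. The snippet below is only a sketch of how the method is typically driven during next-item evaluation: the fitted GRU4Rec instance gru and the NumPy arrays session_ids / item_ids (holding the current event of each tracked session) are assumed to come from an outer evaluation loop and are not defined in this tutorial.
# Sketch only: `gru`, `session_ids` and `item_ids` are assumed external objects.
import numpy as np

batch_size = 100
preds = gru.predict_next_batch(session_ids, item_ids, batch=batch_size)
# preds is a DataFrame with one column per event (0 .. batch_size-1) and one row per item.
# Keeping the same session ID at a given position in later calls preserves that session's
# hidden state; changing it resets the state to zeros.
top_items_first_event = preds[0].nlargest(10).index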
class HGRU4Rec:
"""
HGRU4Rec(session_layers, user_layers, n_epochs=10, batch_size=50,
learning_rate=0.05, momentum=0.0,
adapt='adagrad', decay=0.9, grad_cap=0, sigma=0,
dropout_p_hidden_usr=0.0,
dropout_p_hidden_ses=0.0, dropout_p_init=0.0,
init_as_normal=False, reset_after_session=True, loss='top1', hidden_act='tanh', final_act=None,
train_random_order=False, lmbd=0.0,
session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId', n_sample=0,
sample_alpha=0.75,
item_embedding=None, init_item_embeddings=None,
user_propagation_mode='init', user_to_output=False,
user_to_session_act='tanh', seed=42)
Initializes the network.
Parameters
-----------
session_layers : 1D array
list of the number of GRU units in the session layers
user_layers : 1D array
list of the number of GRU units in the user layers
n_epochs : int
number of training epochs (default: 10)
batch_size : int
size of the minibatch, which also affects the number of negative samples through minibatch-based sampling (default: 50)
dropout_p_hidden_usr : float
probability of dropout of hidden units for the user layers (default: 0.0)
dropout_p_hidden_ses : float
probability of dropout of hidden units for the session layers (default: 0.0)
dropout_p_init : float
probability of dropout of the session-level initialization (default: 0.0)
learning_rate : float
learning rate (default: 0.05)
momentum : float
if not zero, Nesterov momentum will be applied during training with the given strength (default: 0.0)
adapt : None, 'adagrad', 'rmsprop', 'adam', 'adadelta'
sets the appropriate learning rate adaptation strategy, use None for standard SGD (default: 'adagrad')
decay : float
decay parameter for RMSProp, has no effect in other modes (default: 0.9)
grad_cap : float
clip gradients that exceed this value to this value, 0 means no clipping (default: 0.0)
sigma : float
"width" of initialization; either the standard deviation or the min/max of the init interval (with normal and uniform initializations respectively); 0 means adaptive normalization (sigma depends on the size of the weight matrix); (default: 0)
init_as_normal : boolean
False: init from uniform distribution on [-sigma,sigma]; True: init from normal distribution N(0,sigma); (default: False)
reset_after_session : boolean
whether the hidden state is set to zero after a session finished (default: True)
loss : 'top1', 'bpr' or 'cross-entropy'
selects the loss function (default: 'top1')
hidden_act : 'tanh' or 'relu'
selects the activation function on the hidden states (default: 'tanh')
final_act : None, 'linear', 'relu' or 'tanh'
selects the activation function of the final layer where appropriate, None means default (tanh if the loss is bpr or top1; softmax for cross-entropy);
cross-entropy is only affected by 'tanh', in which case the softmax layer is preceded by a tanh nonlinearity (default: None)
train_random_order : boolean
whether to randomize the order of sessions in each epoch (default: False)
lmbd : float
coefficient of the L2 regularization (default: 0.0)
session_key : string
header of the session ID column in the input file (default: 'SessionId')
item_key : string
header of the item ID column in the input file (default: 'ItemId')
time_key : string
header of the timestamp column in the input file (default: 'Time')
user_key : string
header of the user column in the input file (default: 'UserId')
n_sample : int
number of additional negative samples to be used (besides the other examples of the minibatch) (default: 0)
sample_alpha : float
the probability of an item used as an additional negative sample is supp^sample_alpha (default: 0.75)
(e.g.: sample_alpha=1 --> popularity based sampling; sample_alpha=0 --> uniform sampling)
item_embedding: int
size of the item embedding vector (default: None)
init_item_embeddings: 2D array or dict
array with the initial values of the embeddings vector of every item,
or dict that maps each item id to its embedding vector (default: None)
user_propagation_mode: string
'init' to use the (last) user hidden state to initialize the (first) session hidden state;
'all' to also propagate the user hidden state as input to the (first) session layer (default: 'init')
user_to_output: boolean
True to propagate the (last) user hidden state as input to the final output layer, False otherwise (default: False)
user_to_session_act: string
activation of the user-to-session initialization network (default: 'tanh')
seed: int
random seed (default: 42)
"""
def __init__(self, session_layers, user_layers, n_epochs=10, batch_size=50, learning_rate=0.05, momentum=0.0,
adapt='adagrad', decay=0.9, grad_cap=0, sigma=0, dropout_p_hidden_usr=0.0,
dropout_p_hidden_ses=0.0, dropout_p_init=0.0, init_as_normal=False,
reset_after_session=True, loss='top1', hidden_act='tanh', final_act=None, train_random_order=False,
lmbd=0.0, session_key='SessionId', item_key='ItemId', time_key='Time', user_key='UserId', n_sample=0,
sample_alpha=0.75, item_embedding=None, init_item_embeddings=None, user_propagation_mode='init',
user_to_output=False, user_to_session_act='tanh', seed=42):
self.session_layers = session_layers
self.user_layers = user_layers
self.n_epochs = n_epochs
self.batch_size = batch_size
self.dropout_p_hidden_usr = dropout_p_hidden_usr
self.dropout_p_hidden_ses = dropout_p_hidden_ses
self.dropout_p_init = dropout_p_init
self.learning_rate = learning_rate
self.decay = decay
self.momentum = momentum
self.sigma = sigma
self.init_as_normal = init_as_normal
self.reset_after_session = reset_after_session
self.session_key = session_key
self.item_key = item_key
self.time_key = time_key
self.user_key = user_key
self.grad_cap = grad_cap
self.train_random_order = train_random_order
self.lmbd = lmbd
self.user_propagation_mode = user_propagation_mode
self.user_to_output = user_to_output
self.item_embedding = item_embedding
self.init_item_embeddings = init_item_embeddings
self.rng = np.random.RandomState(seed=seed)
if adapt == 'rmsprop':
self.adapt = 'rmsprop'
elif adapt == 'adagrad':
self.adapt = 'adagrad'
elif adapt == 'adadelta':
self.adapt = 'adadelta'
elif adapt == 'adam':
self.adapt = 'adam'
else:
self.adapt = False
if loss == 'cross-entropy':
if final_act == 'tanh':
self.final_activation = self.softmaxth
else:
self.final_activation = self.softmax
self.loss_function = self.cross_entropy
elif loss == 'bpr':
if final_act == 'linear':
self.final_activation = self.linear
elif final_act == 'relu':
self.final_activation = self.relu
else:
self.final_activation = self.tanh
self.loss_function = self.bpr
elif loss == 'top1':
if final_act == 'linear':
self.final_activation = self.linear
elif final_act == 'relu':
self.final_activation = self.relu
else:
self.final_activation = self.tanh
self.loss_function = self.top1
else:
raise NotImplementedError('loss {} not implemented'.format(loss))
if hidden_act == 'relu':
self.hidden_activation = self.relu
elif hidden_act == 'tanh':
self.hidden_activation = self.tanh
else:
raise NotImplementedError('hidden activation {} not implemented'.format(hidden_act))
if user_to_session_act == 'relu':
self.s_init_act = self.relu
elif user_to_session_act == 'tanh':
self.s_init_act = self.tanh
else:
raise NotImplementedError('user-to-session activation {} not implemented'.format(user_to_session_act))
self.n_sample = n_sample
self.sample_alpha = sample_alpha
######################ACTIVATION FUNCTIONS#####################
def linear(self, X):
return X
def tanh(self, X):
return T.tanh(X)
def softmax(self, X):
e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
def softmaxth(self, X):
X = self.tanh(X)
e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')
def relu(self, X):
return T.maximum(X, 0)
def sigmoid(self, X):
return T.nnet.sigmoid(X)
#################################LOSS FUNCTIONS################################
def cross_entropy(self, yhat):
return T.cast(T.mean(-T.log(T.diag(yhat) + 1e-24)), theano.config.floatX)
def bpr(self, yhat):
return T.cast(T.mean(-T.log(T.nnet.sigmoid(T.diag(yhat) - yhat.T))), theano.config.floatX)
def top1(self, yhat):
yhatT = yhat.T
return T.cast(T.mean(
T.mean(T.nnet.sigmoid(-T.diag(yhat) + yhatT) + T.nnet.sigmoid(yhatT ** 2), axis=0) - T.nnet.sigmoid(
T.diag(yhat) ** 2) / self.batch_size), theano.config.floatX)
###############################################################################
def floatX(self, X):
return np.asarray(X, dtype=theano.config.floatX)
def init_weights(self, shape):
sigma = self.sigma if self.sigma != 0 else np.sqrt(6.0 / (shape[0] + shape[1]))
if self.init_as_normal:
return theano.shared(self.floatX(self.rng.randn(*shape) * sigma), borrow=True)
else:
return theano.shared(self.floatX(self.rng.rand(*shape) * sigma * 2 - sigma), borrow=True)
def init_matrix(self, shape):
sigma = self.sigma if self.sigma != 0 else np.sqrt(6.0 / (shape[0] + shape[1]))
if self.init_as_normal:
return self.floatX(self.rng.randn(*shape) * sigma)
else:
return self.floatX(self.rng.rand(*shape) * sigma * 2 - sigma)
def extend_weights(self, W, n_new):
matrix = W.get_value()
sigma = self.sigma if self.sigma != 0 else np.sqrt(6.0 / (matrix.shape[0] + matrix.shape[1] + n_new))
if self.init_as_normal:
new_rows = self.floatX(self.rng.randn(n_new, matrix.shape[1]) * sigma)
else:
new_rows = self.floatX(self.rng.rand(n_new, matrix.shape[1]) * sigma * 2 - sigma)
W.set_value(np.vstack([matrix, new_rows]))
def set_item_embeddings(self, E, values):
if isinstance(values, dict):
keys, values = values.keys(), np.vstack(list(values.values()))
elif isinstance(values, np.ndarray):
# use item ids ranging from 0 to the number of rows in values
keys, values = np.arange(values.shape[0]), values
else:
raise NotImplementedError('Unsupported type')
# map item ids to the internal indices
mask = np.in1d(keys, self.itemidmap.index, assume_unique=True)
idx = self.itemidmap[keys].dropna().values.astype(int)
emb = E.get_value()
emb[idx] = values[mask]
E.set_value(emb)
def preprocess_data(self, data):
# sort by user, session, and time key
data.sort_values([self.user_key, self.session_key, self.time_key], inplace=True)
data.reset_index(drop=True, inplace=True)
offset_session = np.r_[0, data.groupby([self.user_key, self.session_key], sort=False).size().cumsum()[:-1]]
user_indptr = np.r_[0, data.groupby(self.user_key, sort=False)[self.session_key].nunique().cumsum()[:-1]]
return user_indptr, offset_session
def save_state(self):
state = OrderedDict()
for i in range(len(self.session_layers)):
state['Ws_in_' + str(i)] = self.Ws_in[i].get_value()
state['Ws_hh_' + str(i)] = self.Ws_hh[i].get_value()
state['Ws_rz_' + str(i)] = self.Ws_rz[i].get_value()
state['Bs_h_' + str(i)] = self.Bs_h[i].get_value()
state['Hs_' + str(i)] = self.Hs[i].get_value()
state['Wsy'] = self.Wsy.get_value()
state['By'] = self.By.get_value()
for i in range(len(self.user_layers)):
state['Wu_in_' + str(i)] = self.Wu_in[i].get_value()
state['Wu_hh_' + str(i)] = self.Wu_hh[i].get_value()
state['Wu_rz_' + str(i)] = self.Wu_rz[i].get_value()
state['Bu_h_' + str(i)] = self.Bu_h[i].get_value()
state['Hu_' + str(i)] = self.Hu[i].get_value()
if self.user_to_output:
state['Wuy'] = self.Wuy.get_value()
state['Wu_to_s_init'] = self.Ws_init[0].get_value()
state['Bu_to_s_init'] = self.Bs_init[0].get_value()
if self.user_propagation_mode == 'all':
state['Wu_to_s'] = self.Wu_to_s[0].get_value()
return state
def load_state(self, state):
for i in range(len(self.session_layers)):
self.Ws_in[i].set_value(state['Ws_in_' + str(i)], borrow=True)
self.Ws_hh[i].set_value(state['Ws_hh_' + str(i)], borrow=True)
self.Ws_rz[i].set_value(state['Ws_rz_' + str(i)], borrow=True)
self.Bs_h[i].set_value(state['Bs_h_' + str(i)], borrow=True)
self.Hs[i].set_value(state['Hs_' + str(i)], borrow=True)
self.Wsy.set_value(state['Wsy'], borrow=True)
self.By.set_value(state['By'], borrow=True)
for i in range(len(self.user_layers)):
self.Wu_in[i].set_value(state['Wu_in_' + str(i)], borrow=True)
self.Wu_hh[i].set_value(state['Wu_hh_' + str(i)], borrow=True)
self.Wu_rz[i].set_value(state['Wu_rz_' + str(i)], borrow=True)
self.Bu_h[i].set_value(state['Bu_h_' + str(i)], borrow=True)
self.Hu[i].set_value(state['Hu_' + str(i)], borrow=True)
if self.user_to_output:
self.Wuy.set_value(state['Wuy'], borrow=True)
self.Ws_init[0].set_value(state['Wu_to_s_init'], borrow=True)
self.Bs_init[0].set_value(state['Bu_to_s_init'], borrow=True)
if self.user_propagation_mode == 'all':
self.Wu_to_s[0].set_value(state['Wu_to_s'], borrow=True)
def print_state(self):
for i in range(len(self.session_layers)):
print_norm(self.Ws_in[i], 'Ws_in_' + str(i))
print_norm(self.Ws_hh[i], 'Ws_hh_' + str(i))
print_norm(self.Ws_rz[i], 'Ws_rz_' + str(i))
print_norm(self.Bs_h[i], 'Bs_h_' + str(i))
print_norm(self.Hs[i], 'Hs_' + str(i))
print_norm(self.Wsy, 'Wsy')
print_norm(self.By, 'By')
for i in range(len(self.user_layers)):
print_norm(self.Wu_in[i], 'Wu_in_' + str(i))
print_norm(self.Wu_hh[i], 'Wu_hh_' + str(i))
print_norm(self.Wu_rz[i], 'Wu_rz_' + str(i))
print_norm(self.Bu_h[i], 'Bu_h_' + str(i))
print_norm(self.Hu[i], 'Hu_' + str(i))
if self.user_to_output:
print_norm(self.Wuy, 'Wuy')
print_norm(self.Ws_init[0], 'Wu_to_s_init')
print_norm(self.Bs_init[0], 'Bu_to_s_init')
if self.user_propagation_mode == 'all':
print_norm(self.Wu_to_s[0], 'Wu_to_s')
def init(self):
rnn_input_size = self.n_items
if self.item_embedding is not None:
self.E_item = self.init_weights((self.n_items, self.item_embedding))
if self.init_item_embeddings is not None:
self.set_item_embeddings(self.E_item, self.init_item_embeddings)
rnn_input_size = self.item_embedding
# Initialize the session parameters
self.Ws_in, self.Ws_hh, self.Ws_rz, self.Bs_h, self.Hs = [], [], [], [], []
for i in range(len(self.session_layers)):
m = []
m.append(
self.init_matrix((self.session_layers[i - 1] if i > 0 else rnn_input_size, self.session_layers[i])))
m.append(
self.init_matrix((self.session_layers[i - 1] if i > 0 else rnn_input_size, self.session_layers[i])))
m.append(
self.init_matrix((self.session_layers[i - 1] if i > 0 else rnn_input_size, self.session_layers[i])))
self.Ws_in.append(theano.shared(value=np.hstack(m), borrow=True))
self.Ws_hh.append(self.init_weights((self.session_layers[i], self.session_layers[i])))
m2 = []
m2.append(self.init_matrix((self.session_layers[i], self.session_layers[i])))
m2.append(self.init_matrix((self.session_layers[i], self.session_layers[i])))
self.Ws_rz.append(theano.shared(value=np.hstack(m2), borrow=True))
self.Bs_h.append(
theano.shared(value=np.zeros((self.session_layers[i] * 3,), dtype=theano.config.floatX), borrow=True))
self.Hs.append(
theano.shared(value=np.zeros((self.batch_size, self.session_layers[i]), dtype=theano.config.floatX),
borrow=True))
# Session to output weights
self.Wsy = self.init_weights((self.n_items, self.session_layers[-1]))
# Global output bias
self.By = theano.shared(value=np.zeros((self.n_items, 1), dtype=theano.config.floatX), borrow=True)
# Initialize the user parameters
self.Wu_in, self.Wu_hh, self.Wu_rz, self.Bu_h, self.Hu = [], [], [], [], []
for i in range(len(self.user_layers)):
m = []
m.append(self.init_matrix(
(self.user_layers[i - 1] if i > 0 else self.session_layers[-1], self.user_layers[i])))
m.append(self.init_matrix(
(self.user_layers[i - 1] if i > 0 else self.session_layers[-1], self.user_layers[i])))
m.append(self.init_matrix(
(self.user_layers[i - 1] if i > 0 else self.session_layers[-1], self.user_layers[i])))
self.Wu_in.append(theano.shared(value=np.hstack(m), borrow=True))
self.Wu_hh.append(self.init_weights((self.user_layers[i], self.user_layers[i])))
m2 = []
m2.append(self.init_matrix((self.user_layers[i], self.user_layers[i])))
m2.append(self.init_matrix((self.user_layers[i], self.user_layers[i])))
self.Wu_rz.append(theano.shared(value=np.hstack(m2), borrow=True))
self.Bu_h.append(
theano.shared(value=np.zeros((self.user_layers[i] * 3,), dtype=theano.config.floatX), borrow=True))
self.Hu.append(
theano.shared(value=np.zeros((self.batch_size, self.user_layers[i]), dtype=theano.config.floatX),
borrow=True))
if self.user_to_output:
# User to output weights
self.Wuy = self.init_weights((self.n_items, self.user_layers[-1]))
# User-to-Session parameters
self.Ws_init, self.Bs_init = [], []
self.Ws_init.append(self.init_weights((self.user_layers[-1], self.session_layers[0])))
self.Bs_init.append(
theano.shared(value=np.zeros((self.session_layers[0],), dtype=theano.config.floatX), borrow=True))
if self.user_propagation_mode == 'all':
m = []
m.append(self.init_matrix((self.user_layers[-1], self.session_layers[0])))
m.append(self.init_matrix((self.user_layers[-1], self.session_layers[0])))
m.append(self.init_matrix((self.user_layers[-1], self.session_layers[0])))
self.Wu_to_s = [theano.shared(value=np.hstack(m), borrow=True)]
def dropout(self, X, drop_p):
if drop_p > 0:
retain_prob = 1 - drop_p
X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX) / retain_prob
return X
def adam(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
v1 = np.float32(self.decay)
v2 = np.float32(1.0 - self.decay)
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
meang = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
countt = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = v1 * acc + v2 * grad ** 2
meang_new = v1 * meang + v2 * grad
countt_new = countt + 1
updates[acc] = acc_new
updates[meang] = meang_new
updates[countt] = countt_new
else:
acc_s = acc[sample_idx]
meang_s = meang[sample_idx]
countt_s = countt[sample_idx]
acc_new = v1 * acc_s + v2 * grad ** 2
meang_new = v1 * meang_s + v2 * grad
countt_new = countt_s + 1.0
updates[acc] = T.set_subtensor(acc_s, acc_new)
updates[meang] = T.set_subtensor(meang_s, meang_new)
updates[countt] = T.set_subtensor(countt_s, countt_new)
return (meang_new / (1 - v1 ** countt_new)) / (T.sqrt(acc_new / (1 - v1 ** countt_new)) + epsilon)
def adagrad(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = acc + grad ** 2
updates[acc] = acc_new
else:
acc_s = acc[sample_idx]
acc_new = acc_s + grad ** 2
updates[acc] = T.set_subtensor(acc_s, acc_new)
gradient_scaling = T.cast(T.sqrt(acc_new + epsilon), theano.config.floatX)
return grad / gradient_scaling
def adadelta(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
v1 = np.float32(self.decay)
v2 = np.float32(1.0 - self.decay)
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
upd = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = acc + grad ** 2
updates[acc] = acc_new
grad = T.sqrt(upd + epsilon) * grad
upd_new = v1 * upd + v2 * grad ** 2
updates[upd] = upd_new
else:
acc_s = acc[sample_idx]
acc_new = acc_s + grad ** 2
updates[acc] = T.set_subtensor(acc_s, acc_new)
upd_s = upd[sample_idx]
upd_new = v1 * upd_s + v2 * grad ** 2
updates[upd] = T.set_subtensor(upd_s, upd_new)
grad = T.sqrt(upd_s + epsilon) * grad
gradient_scaling = T.cast(T.sqrt(acc_new + epsilon), theano.config.floatX)
return grad / gradient_scaling
def rmsprop(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
v1 = np.float32(self.decay)
v2 = np.float32(1.0 - self.decay)
acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
if sample_idx is None:
acc_new = v1 * acc + v2 * grad ** 2
updates[acc] = acc_new
else:
acc_s = acc[sample_idx]
acc_new = v1 * acc_s + v2 * grad ** 2
updates[acc] = T.set_subtensor(acc_s, acc_new)
gradient_scaling = T.cast(T.sqrt(acc_new + epsilon), theano.config.floatX)
return grad / gradient_scaling
def RMSprop(self, cost, params, full_params, sampled_params, sidxs, epsilon=1e-6):
grads = [T.grad(cost=cost, wrt=param) for param in params]
sgrads = [T.grad(cost=cost, wrt=sparam) for sparam in sampled_params]
updates = OrderedDict()
if self.grad_cap > 0:
norm = T.cast(T.sqrt(T.sum([T.sum([T.sum(g ** 2) for g in g_list]) for g_list in grads]) + T.sum(
[T.sum(g ** 2) for g in sgrads])), theano.config.floatX)
grads = [[T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in g_list] for g_list in
grads]
sgrads = [T.switch(T.ge(norm, self.grad_cap), g * self.grad_cap / norm, g) for g in sgrads]
for p_list, g_list in zip(params, grads):
for p, g in zip(p_list, g_list):
if self.adapt:
if self.adapt == 'adagrad':
g = self.adagrad(p, g, updates)
if self.adapt == 'rmsprop':
g = self.rmsprop(p, g, updates)
if self.adapt == 'adadelta':
g = self.adadelta(p, g, updates)
if self.adapt == 'adam':
g = self.adam(p, g, updates)
if self.momentum > 0:
velocity = theano.shared(p.get_value(borrow=False) * 0., borrow=True)
velocity2 = self.momentum * velocity - np.float32(self.learning_rate) * (g + self.lmbd * p)
updates[velocity] = velocity2
updates[p] = p + velocity2
else:
updates[p] = p * np.float32(1.0 - self.learning_rate * self.lmbd) - np.float32(
self.learning_rate) * g
for i in range(len(sgrads)):
g = sgrads[i]
fullP = full_params[i]
sample_idx = sidxs[i]
sparam = sampled_params[i]
if self.adapt:
if self.adapt == 'adagrad':
g = self.adagrad(fullP, g, updates, sample_idx)
if self.adapt == 'rmsprop':
g = self.rmsprop(fullP, g, updates, sample_idx)
if self.adapt == 'adadelta':
g = self.adadelta(fullP, g, updates, sample_idx)
if self.adapt == 'adam':
g = self.adam(fullP, g, updates, sample_idx)
if self.lmbd > 0:
delta = np.float32(self.learning_rate) * (g + self.lmbd * sparam)
else:
delta = np.float32(self.learning_rate) * g
if self.momentum > 0:
velocity = theano.shared(fullP.get_value(borrow=False) * 0., borrow=True)
vs = velocity[sample_idx]
velocity2 = self.momentum * vs - delta
updates[velocity] = T.set_subtensor(vs, velocity2)
updates[fullP] = T.inc_subtensor(sparam, velocity2)
else:
updates[fullP] = T.inc_subtensor(sparam, - delta)
return updates
def model(self, X, Sstart, Ustart, Hs, Hu, Y=None,
drop_p_hidden_usr=0.0,
drop_p_hidden_ses=0.0,
drop_p_init=0.0):
#
# USER GRU
#
# update the User GRU with the last hidden state of the Session GRU
# NOTE: the User GRU gets actually updated only when a new session starts
user_in = T.dot(Hs[-1], self.Wu_in[0]) + self.Bu_h[0]
user_in = user_in.T
# ^ 3 * user_layers[0] x batch_size
rz_u = T.nnet.sigmoid(user_in[self.user_layers[0]:]
+ T.dot(Hu[0], self.Wu_rz[0]).T)
# ^ 2 * user_layers[0] x batch_size
h_u = self.hidden_activation(T.dot(Hu[0] * rz_u[:self.user_layers[0]].T, self.Wu_hh[0]).T
+ user_in[:self.user_layers[0]])
# ^ user_layers[0] x batch_size
z = rz_u[self.user_layers[0]:].T
# batch_size x user_layers[0]
h_u = (1.0 - z) * Hu[0] + z * h_u.T
h_u = self.dropout(h_u, drop_p_hidden_usr)
# ^ batch_size x user_layers[0]
# update the User GRU only when a new session starts
# Hu contains the state of the previous session
h_u = Hu[0] * (1 - Sstart[:, None]) + h_u * Sstart[:, None]
# ^ batch_size x user_layers[0]
# reset the user network state for new users
h_u = T.zeros_like(h_u) * Ustart[:, None] + h_u * (1 - Ustart[:, None])
Hu_new = [h_u]
for i in range(1, len(self.user_layers)):
user_in = T.dot(h_u, self.Wu_in[i]) + self.Bu_h[i]
user_in = user_in.T
rz_u = T.nnet.sigmoid(user_in[self.user_layers[i]:]
+ T.dot(Hu[i], self.Wu_rz[i]).T)
h_u = self.hidden_activation(T.dot(Hu[i] * rz_u[:self.user_layers[i]].T, self.Wu_hh[i]).T
+ user_in[:self.user_layers[i]])
z = rz_u[self.user_layers[i]:].T
h_u = (1.0 - z) * Hu[i] + z * h_u.T
h_u = self.dropout(h_u, drop_p_hidden_usr)
h_u = Hu[i] * (1 - Sstart[:, None]) + h_u * Sstart[:, None]
h_u = T.zeros_like(h_u) * Ustart[:, None] + h_u * (1 - Ustart[:, None])
Hu_new.append(h_u)
#
# SESSION GRU
#
# Process the input items
if self.item_embedding is not None:
# get the item embedding
SE_item = self.E_item[X] # sampled item embedding
vec = T.dot(SE_item, self.Ws_in[0]) + self.Bs_h[0]
Sin = SE_item
else:
Sx = self.Ws_in[0][X]
vec = Sx + self.Bs_h[0]
Sin = Sx
session_in = vec.T
# ^ session_layers[0] x batch_size
# initialize the h_s with h_c only for starting sessions
h_s_init = self.dropout(self.s_init_act(T.dot(h_u, self.Ws_init[0]) + self.Bs_init[0]), drop_p_init)
h_s = Hs[0] * (1 - Sstart[:, None]) + h_s_init * Sstart[:, None]
# reset h_s for starting users
h_s = h_s * (1 - Ustart[:, None]) + T.zeros_like(h_s) * Ustart[:, None]
self.h_s_init = h_s
if self.user_propagation_mode == 'all':
# this propagates the bias throughout all the session
user_bias = T.dot(h_u, self.Wu_to_s[0]).T
# ^ 3*session_layers[0] x batch_size
# update the Session GRU
rz_s = T.nnet.sigmoid(user_bias[self.session_layers[0]:]
+ session_in[self.session_layers[0]:]
+ T.dot(h_s, self.Ws_rz[0]).T)
# ^ 2*session_layers[0] x batch_size
h_s = self.hidden_activation(T.dot(h_s * rz_s[:self.session_layers[0]].T, self.Ws_hh[0]).T
+ session_in[:self.session_layers[0]])
# ^ session_layers[0] x batch_size
else:
rz_s = T.nnet.sigmoid(session_in[self.session_layers[0]:]
+ T.dot(h_s, self.Ws_rz[0]).T)
h_s = self.hidden_activation(T.dot(h_s * rz_s[:self.session_layers[0]].T, self.Ws_hh[0]).T
+ session_in[:self.session_layers[0]])
z = rz_s[self.session_layers[0]:].T
# ^ batch_size x session_layers[0]
h_s = (1.0 - z) * Hs[0] + z * h_s.T
h_s = self.dropout(h_s, drop_p_hidden_ses)
# ^ batch_size x session_layers[0]
Hs_new = [h_s]
for i in range(1, len(self.session_layers)):
session_in = T.dot(h_s, self.Ws_in[i]) + self.Bs_h[i]
session_in = session_in.T
rz_s = T.nnet.sigmoid(session_in[self.session_layers[i]:]
+ T.dot(Hs[i], self.Ws_rz[i]).T)
h_s = self.hidden_activation(T.dot(Hs[i] * rz_s[:self.session_layers[i]].T, self.Ws_hh[i]).T
+ session_in[:self.session_layers[i]])
z = rz_s[self.session_layers[i]:].T
h_s = (1.0 - z) * Hs[i] + z * h_s.T
h_s = self.dropout(h_s, drop_p_hidden_ses)
Hs_new.append(h_s)
self.h_s_new = h_s
if Y is not None:
Ssy = self.Wsy[Y]
SBy = self.By[Y]
preact = T.dot(h_s, Ssy.T) + SBy.flatten()
sampled_params = [Sin, Ssy, SBy]
if self.user_to_output:
Scy = self.Wuy[Y]
preact += T.dot(h_u, Scy.T)
sampled_params.append(Scy)
y = self.final_activation(preact)
return Hs_new, Hu_new, y, sampled_params
else:
preact = T.dot(h_s, self.Wsy.T) + self.By.flatten()
if self.user_to_output:
preact += T.dot(h_u, self.Wuy.T)
y = self.final_activation(preact)
return Hs_new, Hu_new, y, [Sin]
def fit(self, train_data, valid_data=None, retrain=False, sample_store=10000000, patience=3, margin=1.003,
save_to=None, load_from=None):
'''
Trains the network.
Parameters
--------
train_data : pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
valid_data: pandas.DataFrame
Validation data. If not none, it enables early stopping.
Contains the transactions in the same format as in train_data, and it is used exclusively to compute the loss after each training iteration over train_data.
retrain : boolean
If False, do normal train. If True, do additional train (weights from previous trainings are kept as the initial network) (default: False)
sample_store : int
If additional negative samples are used (n_sample > 0), GPU utilization can be improved by precomputing a large batch of negative samples (and recomputing it when necessary).
This parameter regulates the size of this precomputed ID set. Its value is the maximum number of int values (IDs) to be stored. Precomputed IDs are stored in RAM.
For the most efficient computation, a balance must be found between storing few examples and constantly interrupting GPU computations for a short time vs. computing many examples and interrupting GPU computations for a long time (but rarely).
patience: int
Patience of the early stopping procedure. Number of iterations with non-decreasing validation loss before terminating the training procedure.
margin: float
Margin of early stopping. Minimum relative improvement over the current best validation loss required to avoid incurring a patience penalty.
save_to: string
Path where to save the state of the best model resulting from training.
If early stopping is enabled, saves the model with the lowest validation loss. Otherwise, saves the model corresponding to the last iteration.
load_from: string
Path from where to load the state of a previously saved model.
'''
self.predict = None
self.update = None
self.error_during_train = False
itemids = train_data[self.item_key].unique()
self.n_items = len(itemids)
self.init() # initialize the network
if load_from:
logger.info('Resuming from state: {}'.format(load_from))
self.load_state(pickle.load(open(load_from, 'rb')))
if not retrain:
self.itemidmap = pd.Series(data=np.arange(self.n_items), index=itemids)
train_data = pd.merge(train_data,
pd.DataFrame({self.item_key: itemids, 'ItemIdx': self.itemidmap[itemids].values}),
on=self.item_key, how='inner')
user_indptr, offset_sessions = self.preprocess_data(train_data)
else:
raise Exception('Not supported yet!')
if valid_data is not None:
valid_data = pd.merge(valid_data,
pd.DataFrame({self.item_key: itemids, 'ItemIdx': self.itemidmap[itemids].values}),
on=self.item_key, how='inner')
user_indptr_valid, offset_sessions_valid = self.preprocess_data(valid_data)
X, Y = T.ivectors(2)
Sstart, Ustart = T.fvectors(2)
Hs_new, Hu_new, Y_pred, sampled_params = self.model(X, Sstart, Ustart, self.Hs, self.Hu, Y,
drop_p_hidden_usr=self.dropout_p_hidden_usr,
drop_p_hidden_ses=self.dropout_p_hidden_ses,
drop_p_init=self.dropout_p_init)
cost = self.loss_function(Y_pred)
# set up the parameter and sampled parameter vectors
if self.item_embedding is None:
params = [self.Ws_in[1:], self.Ws_hh, self.Ws_rz, self.Bs_h, self.Ws_init, self.Bs_init,
self.Wu_in, self.Wu_hh, self.Wu_rz, self.Bu_h]
full_params = [self.Ws_in[0], self.Wsy, self.By]
else:
params = [self.Ws_in, self.Ws_hh, self.Ws_rz, self.Bs_h, self.Ws_init, self.Bs_init,
self.Wu_in, self.Wu_hh, self.Wu_rz, self.Bu_h]
full_params = [self.E_item, self.Wsy, self.By]
if self.user_propagation_mode == 'all':
params.append(self.Wu_to_s)
sidxs = [X, Y, Y]
if self.user_to_output:
full_params.append(self.Wuy)
sidxs.append(Y)
updates = self.RMSprop(cost, params, full_params, sampled_params, sidxs)
eval_updates = OrderedDict()
# Update the hidden states of the Session GRU
for i in range(len(self.Hs)):
updates[self.Hs[i]] = Hs_new[i]
eval_updates[self.Hs[i]] = Hs_new[i]
# Update the hidden states of the User GRU
for i in range(len(self.Hu)):
updates[self.Hu[i]] = Hu_new[i]
eval_updates[self.Hu[i]] = Hu_new[i]
# Compile the training and evaluation functions
self.train_function = function(inputs=[X, Sstart, Ustart, Y], outputs=cost, updates=updates,
allow_input_downcast=True,
on_unused_input='warn')
self.eval_function = function(inputs=[X, Sstart, Ustart, Y], outputs=cost, updates=eval_updates,
allow_input_downcast=True,
on_unused_input='warn')
# Negative item sampling
if self.n_sample:
self.neg_sampler = Sampler(train_data,
self.n_sample,
rng=self.rng,
item_key=self.item_key,
sample_alpha=self.sample_alpha,
sample_store=sample_store)
# Training starts here
best_valid, best_state = None, None
my_patience = patience
epoch = 0
while epoch < self.n_epochs and my_patience > 0:
train_cost = self.iterate(train_data, self.train_function, offset_sessions, user_indptr)
# self.print_state()
if np.isnan(train_cost):
return
if valid_data is not None:
valid_cost = self.iterate(valid_data, self.eval_function, offset_sessions_valid, user_indptr_valid)
if best_valid is None or valid_cost < best_valid:
best_valid = valid_cost
best_state = self.save_state()
my_patience = patience
elif valid_cost >= best_valid * margin:
my_patience -= 1
logger.info(
'Epoch {} - train cost: {:.4f} - valid cost: {:.4f} (patience: {})'.format(epoch,
train_cost,
valid_cost,
my_patience))
else:
logger.info('Epoch {} - train cost: {:.4f}'.format(epoch, train_cost))
epoch += 1
if my_patience == 0:
logger.info('Early stopping condition met!')
if best_state:
# always load the state associated with the best validation cost
self.load_state(best_state)
if save_to:
if best_state:
state = best_state
else:
state = self.save_state()
logger.info('Saving model to: {}'.format(save_to))
pickle.dump(state, open(save_to, 'wb'), pickle.HIGHEST_PROTOCOL)
def iterate(self, data, fun, offset_sessions, user_indptr, reset_state=True):
if reset_state:
# Reset session layers
for i in range(len(self.session_layers)):
self.Hs[i].set_value(np.zeros((self.batch_size, self.session_layers[i]), dtype=theano.config.floatX),
borrow=True)
# Reset user layers
for i in range(len(self.user_layers)):
self.Hu[i].set_value(np.zeros((self.batch_size, self.user_layers[i]), dtype=theano.config.floatX),
borrow=True)
# variables to manage iterations over users
n_users = len(user_indptr)
offset_users = offset_sessions[user_indptr]
user_idx_arr = np.arange(n_users - 1)
user_iters = np.arange(self.batch_size)
user_maxiter = user_iters.max()
user_start = offset_users[user_idx_arr[user_iters]]
user_end = offset_users[user_idx_arr[user_iters] + 1]
# variables to manage iterations over sessions
session_iters = user_indptr[user_iters]
session_start = offset_sessions[session_iters]
session_end = offset_sessions[session_iters + 1]
sstart = np.zeros((self.batch_size,), dtype=np.float32)
ustart = np.zeros((self.batch_size,), dtype=np.float32)
finished = False
n = 0
c = []
while not finished:
session_minlen = (session_end - session_start).min()
out_idx = data.ItemIdx.values[session_start]
for i in range(session_minlen - 1):
in_idx = out_idx
out_idx = data.ItemIdx.values[session_start + i + 1]
if self.n_sample:
sample = self.neg_sampler.next_sample()
y = np.hstack([out_idx, sample])
else:
y = out_idx
cost = fun(in_idx, sstart, ustart, y)
n += 1
# reset sstart and ustart
sstart = np.zeros_like(sstart, dtype=np.float32)
ustart = np.zeros_like(ustart, dtype=np.float32)
c.append(cost)
if np.isnan(cost):
logger.error('NaN error!')
self.error_during_train = True
return
session_start = session_start + session_minlen - 1
session_start_mask = np.arange(len(session_iters))[(session_end - session_start) <= 1]
sstart[session_start_mask] = 1
for idx in session_start_mask:
session_iters[idx] += 1
if session_iters[idx] + 1 >= len(offset_sessions):
finished = True
break
session_start[idx] = offset_sessions[session_iters[idx]]
session_end[idx] = offset_sessions[session_iters[idx] + 1]
# reset the User hidden state at user change
user_change_mask = np.arange(len(user_iters))[(user_end - session_start <= 0)]
ustart[user_change_mask] = 1
for idx in user_change_mask:
user_maxiter += 1
if user_maxiter + 1 >= len(offset_users):
finished = True
break
user_iters[idx] = user_maxiter
user_start[idx] = offset_users[user_maxiter]
user_end[idx] = offset_users[user_maxiter + 1]
session_iters[idx] = user_indptr[user_maxiter]
session_start[idx] = offset_sessions[session_iters[idx]]
session_end[idx] = offset_sessions[session_iters[idx] + 1]
avgc = np.mean(c)
return avgc
def predict_next_batch(self, session_ids, input_item_ids, input_user_ids,
predict_for_item_ids=None, batch=100):
'''
Gives prediction scores for a selected set of items. Can be used in batch mode to predict for multiple independent events (i.e. events of different sessions) at once and thus speed up evaluation.
If the session ID at a given coordinate of the session_ids parameter remains the same during subsequent calls of the function, the corresponding hidden state of the network will be kept intact (i.e. that's how one can predict an item to a session).
If it changes, the hidden state of the network is reset to zeros.
Parameters
--------
session_ids : 1D array
Contains the session IDs of the events of the batch. Its length must be equal to the prediction batch size (batch param).
input_item_ids : 1D array
Contains the item IDs of the events of the batch. Every item ID must be in the training data of the network. Its length must be equal to the prediction batch size (batch param).
input_user_ids : 1D array
Contains the user IDs of the events of the batch. Every user ID must be in the training data of the network. Its length must be equal to the prediction batch size (batch param).
predict_for_item_ids : 1D array (optional)
IDs of items for which the network should give prediction scores. Every ID must be in the training set. The default value is None, which means that the network gives prediction on its every output (i.e. for all items in the training set).
batch : int
Prediction batch size.
Returns
--------
out : pandas.DataFrame
Prediction scores for selected items for every event of the batch.
Columns: events of the batch; rows: items. Rows are indexed by the item IDs.
'''
if self.error_during_train: raise Exception
if self.predict is None or self.predict_batch != batch:
X, Y = T.ivectors(2)
Sstart, Ustart = T.fvectors(2)
for i in range(len(self.session_layers)):
self.Hs[i].set_value(np.zeros((batch, self.session_layers[i]), dtype=theano.config.floatX), borrow=True)
for i in range(len(self.user_layers)):
self.Hu[i].set_value(np.zeros((batch, self.user_layers[i]), dtype=theano.config.floatX), borrow=True)
if predict_for_item_ids is not None:
Hs_new, Hu_new, yhat, _ = self.model(X, Sstart, Ustart, self.Hs, self.Hu, Y)
else:
Hs_new, Hu_new, yhat, _ = self.model(X, Sstart, Ustart, self.Hs, self.Hu)
updatesH = OrderedDict()
for i in range(len(self.Hs)):
updatesH[self.Hs[i]] = Hs_new[i]
for i in range(len(self.Hu)):
updatesH[self.Hu[i]] = Hu_new[i]
if predict_for_item_ids is not None:
self.predict = function(inputs=[X, Sstart, Ustart, Y], outputs=yhat, updates=updatesH,
on_unused_input='warn', allow_input_downcast=True)
else:
self.predict = function(inputs=[X, Sstart, Ustart], outputs=yhat, updates=updatesH,
on_unused_input='warn', allow_input_downcast=True)
self.current_session = np.ones(batch) * -1
self.current_users = np.ones(batch) * -1
self.predict_batch = batch
session_change = session_ids != self.current_session
self.current_session = session_ids.copy()
user_change = input_user_ids != self.current_users
self.current_users = input_user_ids.copy()
in_idxs = self.itemidmap[input_item_ids]
if predict_for_item_ids is not None:
iIdxs = self.itemidmap[predict_for_item_ids]
preds = np.asarray(self.predict(in_idxs, session_change, user_change, iIdxs)).T
return pd.DataFrame(data=preds, index=predict_for_item_ids)
else:
preds = np.asarray(self.predict(in_idxs, session_change, user_change)).T
return pd.DataFrame(data=preds, index=self.itemidmap.index)
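The personalized variant takes one extra argument: the user IDs of the events. The snippet below is again only a sketch; the fitted HGRU4Rec instance hgru and the aligned arrays session_ids / item_ids / user_ids (one entry per position in the batch) are assumed to come from an outer evaluation loop.
# Sketch only: `hgru`, `session_ids`, `item_ids` and `user_ids` are assumed external objects.
preds = hgru.predict_next_batch(session_ids, item_ids, user_ids, batch=100)
# A changed session ID at a given position is treated as a session start (the session state is
# re-initialized from the user state); a changed user ID is treated as a new user (states reset).
top_items_first_event = preds[0].nlargest(10).index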
class RNNRecommender(ISeqRecommender):
"""
A **simplified** interface to Recurrent Neural Network models for Session-based recommendation.
Based on the following two papers:
* Recurrent Neural Networks with Top-k Gains for Session-based Recommendations, Hidasi and Karatzoglou, CIKM 2018
* Personalizing Session-based Recommendation with Hierarchical Recurrent Neural Networks, Quadrana et al, Recsys 2017
"""
def __init__(self,
session_layers,
user_layers=None,
batch_size=32,
learning_rate=0.1,
momentum=0.0,
dropout=None,
epochs=10,
personalized=False):
"""
:param session_layers: number of units per layer used at session level.
It has to be a list of integers for multi-layer networks, or an integer value for single-layer networks.
:param user_layers: number of units per layer used at user level. Required only by personalized models.
It has to be a list of integers for multi-layer networks, or an integer value for single-layer networks.
:param batch_size: the mini-batch size used in training
:param learning_rate: the learning rate used in training (Adagrad optimized)
:param momentum: the momentum coefficient used in training
:param dropout: dropout coefficients.
If personalized=False, it's a float value for the hidden-layer(s) dropout.
If personalized=True, it's a 3-tuple with the values for the dropout of (user hidden, session hidden, user-to-session hidden) layers.
:param epochs: number of training epochs
:param personalized: whether to train a personalized model using the HRNN model.
It will require user ids at prediction time.
"""
super(RNNRecommender, self).__init__()
if isinstance(session_layers, int):
session_layers = [session_layers]
if isinstance(user_layers, int):
user_layers = [user_layers]
self.session_layers = session_layers
self.user_layers = user_layers
self.batch_size = batch_size
self.learning_rate = learning_rate
self.momentum = momentum
if dropout is None:
if not personalized:
dropout = 0.0
else:
dropout = (0.0, 0.0, 0.0)
self.dropout = dropout
self.epochs = epochs
self.personalized = personalized
self.pseudo_session_id = 0
def __str__(self):
return 'RNNRecommender(' \
'session_layers={session_layers}, ' \
'user_layers={user_layers}, ' \
'batch_size={batch_size}, ' \
'learning_rate={learning_rate}, ' \
'momentum={momentum}, ' \
'dropout={dropout}, ' \
'epochs={epochs}, ' \
'personalized={personalized}, ' \
')'.format(**self.__dict__)
def fit(self, train_data):
self.logger.info('Converting training data to GRU4Rec format')
# parse training data to GRU4Rec format
train_data = dataset_to_gru4rec_format(dataset=train_data)
if not self.personalized:
# fit GRU4Rec
self.model = GRU4Rec(layers=self.session_layers,
n_epochs=self.epochs,
batch_size=self.batch_size,
learning_rate=self.learning_rate,
momentum=self.momentum,
dropout_p_hidden=self.dropout,
session_key='session_id',
item_key='item_id',
time_key='ts')
else:
if self.user_layers is None:
raise ValueError('You should set the value of user_layers before training the personalized model.')
if len(self.dropout) != 3:
raise ValueError('dropout should be a 3 tuple with '
'(user hidden, session hidden, user-to-session hidden) dropout values.')
self.model = HGRU4Rec(session_layers=self.session_layers,
user_layers=self.user_layers,
batch_size=self.batch_size,
n_epochs=self.epochs,
learning_rate=self.learning_rate,
momentum=self.momentum,
dropout_p_hidden_usr=self.dropout[0],
dropout_p_hidden_ses=self.dropout[1],
dropout_p_init=self.dropout[2],
session_key='session_id',
user_key='user_id',
item_key='item_id',
time_key='ts')
self.logger.info('Training started')
self.model.fit(train_data)
self.logger.info('Training completed')
def recommend(self, user_profile, user_id=None):
if not self.personalized:
for item in user_profile:
pred = self.model.predict_next_batch(np.array([self.pseudo_session_id]),
np.array([item]),
batch=1)
else:
if user_id is None:
raise ValueError('user_id required by personalized models')
for item in user_profile:
pred = self.model.predict_next_batch(np.array([self.pseudo_session_id]),
np.array([item]),
np.array([user_id]),
batch=1)
# sort items by predicted score
pred.sort_values(0, ascending=False, inplace=True)
# increase the pseudo-session id so that future calls to recommend() won't be connected
self.pseudo_session_id += 1
# convert to the required output format
return [([x.index], x._2) for x in pred.reset_index().itertuples()]
rnnrecommender = RNNRecommender(session_layers=[20],
batch_size=16,
learning_rate=0.1,
momentum=0.1,
dropout=0.1,
epochs=5,
personalized=False)
rnnrecommender.fit(train_data)
Here we fit the recommendation algorithm over the sessions in the training set.
This is a simplified interface to Recurrent Neural Network models for session-based recommendation, based on the following two papers:
- Recurrent Neural Networks with Top-k Gains for Session-based Recommendations, Hidasi and Karatzoglou, CIKM 2018
- Personalizing Session-based Recommendation with Hierarchical Recurrent Neural Networks, Quadrana et al, Recsys 2017
In this notebook, we will consider the session-aware (personalized) version of the algorithm. The model works as follows:
Each user session goes through a session RNN, which models short-term user preferences. At the end of each session, the state of the session RNN is used to update a user RNN, which models longer-term user preferences. Its state is passed forward to the next session RNN, which can then personalize recommendations based on both short-term and long-term user interests.
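To make that handoff concrete, here is a deliberately simplified, NumPy-only sketch of the idea. It is not the Theano implementation shown above: gru_step is a stand-in for a full GRU cell, and the weight containers W_ses, W_usr and W_init are placeholders with assumed compatible shapes.
# Conceptual sketch of the hierarchical state handoff (NOT the actual HGRU4Rec code).
import numpy as np

def gru_step(state, x, W):
    # stand-in for a GRU update: mixes the previous state with the new input
    return np.tanh(x @ W['in'] + state @ W['hh'])

def user_representation(sessions, W_ses, W_usr, W_init):
    h_user = np.zeros(W_usr['hh'].shape[0])           # long-term (user-level) state
    for session in sessions:                          # each session: list of item vectors
        h_ses = np.tanh(h_user @ W_init)              # user state initializes the session RNN
        for x in session:
            h_ses = gru_step(h_ses, x, W_ses)         # short-term (session-level) updates
        h_user = gru_step(h_user, h_ses, W_usr)       # at session end, update the user RNN
    return h_user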
The hyper-parameters of the model are:
- session_layers: number of units per layer used at session level. It has to be a list of integers for multi-layer networks, or an integer value for single-layer networks.
- user_layers: number of units per layer used at user level. Required only by personalized models. It has to be a list of integers for multi-layer networks, or an integer value for single-layer networks.
- batch_size: the mini-batch size used in training
- learning_rate: the learning rate used in training (with the Adagrad optimizer)
- momentum: the momentum coefficient used in training
- dropout: a 3-tuple with the dropout values for the (user hidden, session hidden, user-to-session hidden) layers
- epochs: number of training epochs
- personalized: whether to train a personalized model using the HRNN model (True in this case)
NOTE: HGRU4Rec originally has more hyper-parameters. Going through all of them is outside the scope of this tutorial, but we suggest checking out the original source code here in case you are interested.
prnnrecommender = RNNRecommender(session_layers=[20],
user_layers=[20],
batch_size=16,
learning_rate=0.5,
momentum=0.1,
dropout=(0.1,0.1,0.1),
epochs=5,
personalized=True)
prnnrecommender.fit(train_data)
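Once the personalized model is fitted, recommendations are requested per user by passing both the current session prefix and the user id. The item and user ids below are placeholders, shown only to illustrate the call; they must appear in the training data.
# Hypothetical IDs, used only to illustrate the signature of recommend().
user_profile = ['item_a', 'item_b']   # items of the current session, in order
recommendations = prnnrecommender.recommend(user_profile, user_id='user_1')
print(recommendations[:5])            # list of ([item_id], score) pairs, highest score first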
The class KNNRecommender takes the following initialization hyper-parameters:
- model: one of the following KNN models:
  - iknn: ItemKNN, item-to-item KNN based on the last item in the session to determine the items to be recommended.
  - sknn: SessionKNN, compares the entire current session with the past sessions in the training data to determine the items to be recommended.
  - v-sknn: VMSessionKNN, uses linearly decayed real-valued vectors to encode the current session, then compares the current session with the past sessions in the training data using the dot product to determine the items to be recommended.
  - s-sknn: SeqSessionKNN, this variant also puts more weight on elements that appear later in the session by using a custom scoring function (see the paper by Ludewig and Jannach).
  - sf-sknn: SeqFilterSessionKNN, this variant also puts more weight on elements that appear later in the session, in a more restrictive way, by using a custom scoring function (see the paper by Ludewig and Jannach).
- init_args: the model initialization arguments. See the following initializations, or check util.knn for more details on each model:
  - iknn: ItemKNN(n_sims=100, lmbd=20, alpha=0.5)
  - sknn: SessionKNN(k, sample_size=500, sampling='recent', similarity='jaccard', remind=False, pop_boost=0)
  - v-sknn: VMSessionKNN(k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', weighting_time=False, normalize=True)
  - s-sknn: SeqSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div', remind=False, pop_boost=0, extend=False, normalize=True)
  - sf-sknn: SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, extend=False, normalize=True)
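As a concrete example, the snippet below instantiates the session-based variant. It is a sketch that assumes the KNNRecommender interface described above (a model name plus a dict of init_args forwarded to the underlying KNN class); the hyper-parameter values are illustrative only.
# Sketch based on the interface described above; values are illustrative, not tuned.
knnrecommender = KNNRecommender(model='sknn',
                                init_args={'k': 100,
                                           'sample_size': 500,
                                           'sampling': 'recent',
                                           'similarity': 'cosine'})
knnrecommender.fit(train_data)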
class ItemKNN:
'''
ItemKNN(n_sims = 100, lmbd = 20, alpha = 0.5, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time')
Item-to-item predictor that computes the similarity of all items to the given item.
The similarity of two items is given by:
.. math::
    s_{i,j} = \frac{\sum_{s} I\{(s,i) \in D \wedge (s,j) \in D\}}{(supp_i + \lambda)^{\alpha} \, (supp_j + \lambda)^{1-\alpha}}
Parameters
--------
n_sims : int
Only give back non-zero scores to the N most similar items. Should be higher than or equal to the cut-off of your evaluation. (Default value: 100)
lmbd : float
Regularization. Discounts the similarity of rare items (incidental co-occurrences). (Default value: 20)
alpha : float
Balance between normalizing with the supports of the two items. 0.5 gives cosine similarity, 1.0 gives confidence (as in association rules).
session_key : string
header of the session ID column in the input file (default: 'SessionId')
item_key : string
header of the item ID column in the input file (default: 'ItemId')
time_key : string
header of the timestamp column in the input file (default: 'Time')
'''
def __init__(self, n_sims=100, lmbd=20, alpha=0.5, session_key='SessionId', item_key='ItemId', time_key='Time'):
self.n_sims = n_sims
self.lmbd = lmbd
self.alpha = alpha
self.item_key = item_key
self.session_key = session_key
self.time_key = time_key
def fit(self, data):
'''
Trains the predictor.
Parameters
--------
data: pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
'''
data.set_index(np.arange(len(data)), inplace=True)
self.itemids = data[self.item_key].unique()
n_items = len(self.itemids)
data = pd.merge(data, pd.DataFrame({self.item_key: self.itemids, 'ItemIdx': np.arange(len(self.itemids))}),
on=self.item_key, how='inner')
sessionids = data[self.session_key].unique()
data = pd.merge(data, pd.DataFrame({self.session_key: sessionids, 'SessionIdx': np.arange(len(sessionids))}),
on=self.session_key, how='inner')
supp = data.groupby('SessionIdx').size()
session_offsets = np.zeros(len(supp) + 1, dtype=np.int32)
session_offsets[1:] = supp.cumsum()
index_by_sessions = data.sort_values(['SessionIdx', self.time_key]).index.values
supp = data.groupby('ItemIdx').size()
item_offsets = np.zeros(n_items + 1, dtype=np.int32)
item_offsets[1:] = supp.cumsum()
index_by_items = data.sort_values(['ItemIdx', self.time_key]).index.values
self.sims = dict()
for i in range(n_items):
iarray = np.zeros(n_items)
start = item_offsets[i]
end = item_offsets[i + 1]
for e in index_by_items[start:end]:
uidx = data.SessionIdx.values[e]
ustart = session_offsets[uidx]
uend = session_offsets[uidx + 1]
user_events = index_by_sessions[ustart:uend]
iarray[data.ItemIdx.values[user_events]] += 1
iarray[i] = 0
norm = np.power((supp[i] + self.lmbd), self.alpha) * np.power((supp.values + self.lmbd), (1.0 - self.alpha))
norm[norm == 0] = 1
iarray = iarray / norm
indices = np.argsort(iarray)[-1:-1 - self.n_sims:-1]
self.sims[self.itemids[i]] = pd.Series(data=iarray[indices], index=self.itemids[indices])
def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
'''
Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
Parameters
--------
session_id : int or string
The session IDs of the event.
input_item_id : int or string
The item ID of the event. Must be in the set of item IDs of the training set.
predict_for_item_ids : 1D array
IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
Returns
--------
out : pandas.Series
Prediction scores for selected items on how likely they are to be the next item of this session. Indexed by the item IDs.
'''
if predict_for_item_ids is None:
predict_for_item_ids = self.itemids
preds = np.zeros(len(predict_for_item_ids))
sim_list = self.sims[input_item_id]
mask = np.in1d(predict_for_item_ids, sim_list.index)
preds[mask] = sim_list[predict_for_item_ids[mask]]
return pd.Series(data=preds, index=predict_for_item_ids)
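To see ItemKNN in isolation, here is a toy example. The DataFrame is made up purely to illustrate the expected columns (SessionId, ItemId, Time) and the predict_next call; it is not part of the tutorial dataset.
# Toy data, used only to illustrate the ItemKNN API shown above.
import pandas as pd

toy = pd.DataFrame({'SessionId': [1, 1, 2, 2, 2],
                    'ItemId':    ['a', 'b', 'a', 'c', 'b'],
                    'Time':      [1, 2, 3, 4, 5]})
iknn = ItemKNN(n_sims=10, lmbd=20, alpha=0.5)
iknn.fit(toy)
# scores for every training item, given that the current session just saw item 'a'
print(iknn.predict_next(session_id=2, input_item_id='a').nlargest(3))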
class SeqFilterSessionKNN:
'''
SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time')
Parameters
-----------
k : int
Number of neighboring session to calculate the item scores from. (Default value: 100)
sample_size : int
Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 500)
sampling : string
String to define the sampling method for sessions (recent, random). (default: recent)
similarity : string
String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard)
remind : bool
Should the last items of the current session be boosted to the top as reminders
pop_boost : int
Push popular items in the neighbor sessions by this factor. (default: 0 to leave out)
extend : bool
Add evaluated sessions to the maps
normalize : bool
Normalize the scores in the end
session_key : string
Header of the session ID column in the input file. (default: 'SessionId')
item_key : string
Header of the item ID column in the input file. (default: 'ItemId')
time_key : string
Header of the timestamp column in the input file. (default: 'Time')
'''
def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'):
self.remind = remind
self.k = k
self.sample_size = sample_size
self.sampling = sampling
self.similarity = similarity
self.pop_boost = pop_boost
self.session_key = session_key
self.item_key = item_key
self.time_key = time_key
self.extend = extend
self.normalize = normalize
# updated while recommending
self.session = -1
self.session_items = []
self.relevant_sessions = set()
# cache relations once at startup
self.session_item_map = dict()
self.item_session_map = dict()
self.session_time = dict()
self.followed_by = dict()
self.sim_time = 0
def fit(self, train, items=None):
'''
Trains the predictor.
Parameters
--------
data: pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
'''
index_session = train.columns.get_loc(self.session_key)
index_item = train.columns.get_loc(self.item_key)
index_time = train.columns.get_loc(self.time_key)
self.itemids = train[self.item_key].unique()
session = -1
session_items = set()
last_item = -1
time = -1
# cnt = 0
for row in train.itertuples(index=False):
# cache items of sessions
if row[index_session] != session:
if len(session_items) > 0:
self.session_item_map.update({session: session_items})
# cache the last time stamp of the session
self.session_time.update({session: time})
session = row[index_session]
session_items = set()
else:
if last_item != -1: # fill followed by map for filtering of candidate items
if not last_item in self.followed_by:
self.followed_by[last_item] = set()
self.followed_by[last_item].add(row[index_item])
time = row[index_time]
session_items.add(row[index_item])
# cache sessions involving an item
map_is = self.item_session_map.get(row[index_item])
if map_is is None:
map_is = set()
self.item_session_map.update({row[index_item]: map_is})
map_is.add(row[index_session])
last_item = row[index_item]
# Add the last tuple
self.session_item_map.update({session: session_items})
self.session_time.update({session: time})
def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
'''
Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
Parameters
--------
session_id : int or string
The session IDs of the event.
input_item_id : int or string
The item ID of the event. Must be in the set of item IDs of the training set.
predict_for_item_ids : 1D array
IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
Returns
--------
out : pandas.Series
Prediction scores for selected items on how likely they are to be the next item of this session. Indexed by the item IDs.
'''
# gc.collect()
# process = psutil.Process(os.getpid())
# print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
if (self.session != session_id): # new session
if (self.extend):
item_set = set(self.session_items)
self.session_item_map[self.session] = item_set;
for item in item_set:
map_is = self.item_session_map.get(item)
if map_is is None:
map_is = set()
self.item_session_map.update({item: map_is})
map_is.add(self.session)
ts = time.time()
self.session_time.update({self.session: ts})
last_item = -1
for item in self.session_items:
if last_item != -1:
if not last_item in self.followed_by:
self.followed_by[last_item] = set()
self.followed_by[last_item].add(item)
last_item = item
self.session = session_id
self.session_items = list()
self.relevant_sessions = set()
if type == 'view':
self.session_items.append(input_item_id)
if skip:
return
neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id)
scores = self.score_items(neighbors, input_item_id)
# add some reminders
if self.remind:
reminderScore = 5
takeLastN = 3
cnt = 0
for elem in self.session_items[-takeLastN:]:
cnt = cnt + 1
# reminderScore = reminderScore + (cnt/100)
oldScore = scores.get(elem)
newScore = 0
if oldScore is None:
newScore = reminderScore
else:
newScore = oldScore + reminderScore
# print 'old score ', oldScore
# update the score and add a small number for the position
newScore = (newScore * reminderScore) + (cnt / 100)
scores.update({elem: newScore})
# push popular ones
if self.pop_boost > 0:
pop = self.item_pop(neighbors)
# Iterate over the item neighbors
# print itemScores
for key in scores:
item_pop = pop.get(key)
# Gives some minimal MRR boost?
scores.update({key: (scores[key] + (self.pop_boost * item_pop))})
# Create things in the format ..
if predict_for_item_ids is None:
predict_for_item_ids = self.itemids
predictions = np.zeros(len(predict_for_item_ids))
mask = np.in1d(predict_for_item_ids, list(scores.keys()))
items = predict_for_item_ids[mask]
values = [scores[x] for x in items]
predictions[mask] = values
series = pd.Series(data=predictions, index=predict_for_item_ids)
if self.normalize:
series = series / series.max()
return series
def item_pop(self, sessions):
'''
Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
Parameters
--------
sessions: set
Returns
--------
out : dict
'''
result = dict()
max_pop = 0
for session, weight in sessions:
items = self.items_for_session(session)
for item in items:
count = result.get(item)
if count is None:
result.update({item: 1})
else:
result.update({item: count + 1})
if (result.get(item) > max_pop):
max_pop = result.get(item)
for key in result:
result.update({key: (result[key] / max_pop)})
return result
def jaccard(self, first, second):
'''
Calculates the jaccard index for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
sc = time.perf_counter()
intersection = len(first & second)
union = len(first | second)
res = intersection / union
self.sim_time += (time.perf_counter() - sc)
return res
def cosine(self, first, second):
'''
Calculates the cosine similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (sqrt(la) * sqrt(lb))
return result
def tanimoto(self, first, second):
'''
Calculates the cosine tanimoto similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (la + lb - li)
return result
def binary(self, first, second):
'''
Calculates a binary overlap coefficient for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
a = len(first & second)
b = len(first)
c = len(second)
result = (2 * a) / ((2 * a) + b + c)
return result
def items_for_session(self, session):
'''
Returns all items in the session
Parameters
--------
session: Id of a session
Returns
--------
out : set
'''
return self.session_item_map.get(session);
def sessions_for_item(self, item_id):
'''
Returns all sessions for an item
Parameters
--------
item: Id of the item session
Returns
--------
out : set
'''
return self.item_session_map.get(item_id)
def most_recent_sessions(self, sessions, number):
'''
Find the most recent sessions in the given set
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
sample = set()
tuples = list()
for session in sessions:
time = self.session_time.get(session)
if time is None:
print(' EMPTY TIMESTAMP!! ', session)
tuples.append((session, time))
tuples = sorted(tuples, key=itemgetter(1), reverse=True)
# print 'sorted list ', sortedList
cnt = 0
for element in tuples:
cnt = cnt + 1
if cnt > number:
break
sample.add(element[0])
# print 'returning sample of size ', len(sample)
return sample
def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
'''
Find a set of sessions to later find neighbors in.
A self.sample_size of 0 uses all sessions in which any item of the current session appears.
self.sampling can be performed with the options "recent" or "random".
"recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
if self.sample_size == 0: # use all session as possible neighbors
print('!!!!! running KNN without a sample size (check config)')
return self.relevant_sessions
else: # sample some sessions
self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
if len(self.relevant_sessions) > self.sample_size:
if self.sampling == 'recent':
sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
elif self.sampling == 'random':
sample = random.sample(self.relevant_sessions, self.sample_size)
else:
sample = self.relevant_sessions[:self.sample_size]
return sample
else:
return self.relevant_sessions
def calc_similarity(self, session_items, sessions):
'''
Calculates the configured similarity for the items in session_items and each session in sessions.
Parameters
--------
session_items: set of item ids
sessions: list of session ids
Returns
--------
out : list of tuple (session_id,similarity)
'''
# print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
neighbors = []
cnt = 0
for session in sessions:
cnt = cnt + 1
# get items of the session, look up the cache first
session_items_test = self.items_for_session(session)
similarity = getattr(self, self.similarity)(session_items_test, session_items)
if similarity > 0:
neighbors.append((session, similarity))
return neighbors
# -----------------
# Find a set of neighbors, returns a list of tuples (sessionid: similarity)
# -----------------
def find_neighbors(self, session_items, input_item_id, session_id):
'''
Finds the k nearest neighbors for the given session_id and the current item input_item_id.
Parameters
--------
session_items: set of item ids
input_item_id: int
session_id: int
Returns
--------
out : list of tuple (session_id, similarity)
'''
possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
possible_neighbors = self.calc_similarity(session_items, possible_neighbors)
possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
possible_neighbors = possible_neighbors[:self.k]
return possible_neighbors
def score_items(self, neighbors, input_item_id):
'''
Compute a set of scores for all items given a set of neighbors.
Parameters
--------
neighbors: set of session ids
Returns
--------
out : list of tuple (item, score)
'''
# now we have the set of relevant items to make predictions
scores = dict()
# iterate over the sessions
for session in neighbors:
# get the items in this session
items = self.items_for_session(session[0])
for item in items:
if input_item_id in self.followed_by and item in self.followed_by[
input_item_id]: # hard filter the candidates
old_score = scores.get(item)
new_score = session[1]
if old_score is None:
scores.update({item: new_score})
else:
new_score = old_score + new_score
scores.update({item: new_score})
return scores
class VMSessionKNN:
'''
VMSessionKNN( k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div', dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score', weighting_time=False, normalize=True, session_key = 'SessionId', item_key= 'ItemId', time_key= 'Time')
Parameters
-----------
k : int
Number of neighboring sessions to calculate the item scores from. (Default value: 100)
sample_size : int
Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
sampling : string
String to define the sampling method for sessions (recent, random). (default: recent)
similarity : string
String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: cosine)
weighting : string
Decay function to determine the importance/weight of individual actions in the current session (linear, same, div, log, quadratic). (default: div)
weighting_score : string
Decay function to lower the score of candidate items from a neighboring sessions that were selected by less recently clicked items in the current session. (linear, same, div, log, quadratic). (default: div_score)
weighting_time : boolean
Experimental function to give less weight to items from older sessions (default: False)
dwelling_time : boolean
Experimental function to use the dwelling time for item view actions as a weight in the similarity calculation. (default: False)
last_n_days : int
Use only data from the last N days. (default: None)
last_n_clicks : int
Use only the last N clicks of the current session when recommending. (default: None)
extend : bool
Add evaluated sessions to the maps.
normalize : bool
Normalize the scores in the end.
session_key : string
Header of the session ID column in the input file. (default: 'SessionId')
item_key : string
Header of the item ID column in the input file. (default: 'ItemId')
time_key : string
Header of the timestamp column in the input file. (default: 'Time')
'''
def __init__(self, k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div',
dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score',
weighting_time=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'):
self.k = k
self.sample_size = sample_size
self.sampling = sampling
self.weighting = weighting
self.dwelling_time = dwelling_time
self.weighting_score = weighting_score
self.weighting_time = weighting_time
self.similarity = similarity
self.session_key = session_key
self.item_key = item_key
self.time_key = time_key
self.extend = extend
self.normalize = normalize
self.last_n_days = last_n_days
self.last_n_clicks = last_n_clicks
# updated while recommending
self.session = -1
self.session_items = []
self.relevant_sessions = set()
# cache relations once at startup
self.session_item_map = dict()
self.item_session_map = dict()
self.session_time = dict()
self.min_time = -1
self.sim_time = 0
def fit(self, data, items=None):
'''
Trains the predictor.
Parameters
--------
data: pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
'''
if self.last_n_days != None:
max_time = dt.fromtimestamp(data[self.time_key].max())
date_threshold = max_time.date() - td(self.last_n_days)
stamp = dt.combine(date_threshold, dt.min.time()).timestamp()
train = data[data[self.time_key] >= stamp]
else:
train = data
self.num_items = train[self.item_key].max()
index_session = train.columns.get_loc(self.session_key)
index_item = train.columns.get_loc(self.item_key)
index_time = train.columns.get_loc(self.time_key)
self.itemids = train[self.item_key].unique()
session = -1
session_items = set()
time = -1
# cnt = 0
for row in train.itertuples(index=False):
# cache items of sessions
if row[index_session] != session:
if len(session_items) > 0:
self.session_item_map.update({session: session_items})
# cache the last time stamp of the session
self.session_time.update({session: time})
if time < self.min_time:
self.min_time = time
session = row[index_session]
session_items = set()
time = row[index_time]
session_items.add(row[index_item])
# cache sessions involving an item
map_is = self.item_session_map.get(row[index_item])
if map_is is None:
map_is = set()
self.item_session_map.update({row[index_item]: map_is})
map_is.add(row[index_session])
# Add the last tuple
self.session_item_map.update({session: session_items})
self.session_time.update({session: time})
def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
'''
Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
Parameters
--------
session_id : int or string
The session IDs of the event.
input_item_id : int or string
The item ID of the event. Must be in the set of item IDs of the training set.
predict_for_item_ids : 1D array
IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
Returns
--------
out : pandas.Series
Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
'''
# gc.collect()
# process = psutil.Process(os.getpid())
# print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
if (self.session != session_id): # new session
if (self.extend):
item_set = set(self.session_items)
self.session_item_map[self.session] = item_set;
for item in item_set:
map_is = self.item_session_map.get(item)
if map_is is None:
map_is = set()
self.item_session_map.update({item: map_is})
map_is.add(self.session)
ts = time.time()
self.session_time.update({self.session: ts})
self.last_ts = -1
self.session = session_id
self.session_items = list()
self.dwelling_times = list()
self.relevant_sessions = set()
if type == 'view':
self.session_items.append(input_item_id)
if self.dwelling_time:
if self.last_ts > 0:
self.dwelling_times.append(timestamp - self.last_ts)
self.last_ts = timestamp
if skip:
return
items = self.session_items if self.last_n_clicks is None else self.session_items[-self.last_n_clicks:]
neighbors = self.find_neighbors(items, input_item_id, session_id, self.dwelling_times, timestamp)
scores = self.score_items(neighbors, items, timestamp)
# Create things in the format ..
if predict_for_item_ids is None:
predict_for_item_ids = self.itemids
predictions = np.zeros(len(predict_for_item_ids))
mask = np.in1d(predict_for_item_ids, list(scores.keys()))
items = predict_for_item_ids[mask]
values = [scores[x] for x in items]
predictions[mask] = values
series = pd.Series(data=predictions, index=predict_for_item_ids)
if self.normalize:
series = series / series.max()
return series
def item_pop(self, sessions):
'''
Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
Parameters
--------
sessions: set
Returns
--------
out : dict
'''
result = dict()
max_pop = 0
for session, weight in sessions:
items = self.items_for_session(session)
for item in items:
count = result.get(item)
if count is None:
result.update({item: 1})
else:
result.update({item: count + 1})
if (result.get(item) > max_pop):
max_pop = result.get(item)
for key in result:
result.update({key: (result[key] / max_pop)})
return result
def jaccard(self, first, second):
'''
Calculates the jaccard index for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
sc = time.perf_counter()
intersection = len(first & second)
union = len(first | second)
res = intersection / union
self.sim_time += (time.perf_counter() - sc)
return res
def cosine(self, first, second):
'''
Calculates the cosine similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (sqrt(la) * sqrt(lb))
return result
def tanimoto(self, first, second):
'''
Calculates the cosine tanimoto similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (la + lb - li)
return result
def binary(self, first, second):
'''
Calculates a binary overlap coefficient for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
a = len(first & second)
b = len(first)
c = len(second)
result = (2 * a) / ((2 * a) + b + c)
return result
def vec(self, first, second, map):
'''
Calculates a weighted overlap similarity for two sessions, using the item weights in map
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
a = first & second
sum = 0
for i in a:
sum += map[i]
result = sum / len(map)
return result
def items_for_session(self, session):
'''
Returns all items in the session
Parameters
--------
session: Id of a session
Returns
--------
out : set
'''
return self.session_item_map.get(session);
def vec_for_session(self, session):
'''
Returns all items in the session
Parameters
--------
session: Id of a session
Returns
--------
out : set
'''
return self.session_vec_map.get(session);
def sessions_for_item(self, item_id):
'''
Returns all sessions for an item
Parameters
--------
item: Id of the item session
Returns
--------
out : set
'''
return self.item_session_map.get(item_id) if item_id in self.item_session_map else set()
def most_recent_sessions(self, sessions, number):
'''
Find the most recent sessions in the given set
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
sample = set()
tuples = list()
for session in sessions:
time = self.session_time.get(session)
if time is None:
print(' EMPTY TIMESTAMP!! ', session)
tuples.append((session, time))
tuples = sorted(tuples, key=itemgetter(1), reverse=True)
# print 'sorted list ', sortedList
cnt = 0
for element in tuples:
cnt = cnt + 1
if cnt > number:
break
sample.add(element[0])
# print 'returning sample of size ', len(sample)
return sample
def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
'''
Find a set of sessions to later find neighbors in.
A self.sample_size of 0 uses all sessions in which any item of the current session appears.
self.sampling can be performed with the options "recent" or "random".
"recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id)
if self.sample_size == 0: # use all session as possible neighbors
print('!!!!! running KNN without a sample size (check config)')
return self.relevant_sessions
else: # sample some sessions
if len(self.relevant_sessions) > self.sample_size:
if self.sampling == 'recent':
sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
elif self.sampling == 'random':
sample = random.sample(self.relevant_sessions, self.sample_size)
else:
sample = self.relevant_sessions[:self.sample_size]
return sample
else:
return self.relevant_sessions
def calc_similarity(self, session_items, sessions, dwelling_times, timestamp):
'''
Calculates the configured similarity for the items in session_items and each session in sessions.
Parameters
--------
session_items: set of item ids
sessions: list of session ids
Returns
--------
out : list of tuple (session_id,similarity)
'''
pos_map = {}
length = len(session_items)
count = 1
for item in session_items:
if self.weighting is not None:
pos_map[item] = getattr(self, self.weighting)(count, length)
count += 1
else:
pos_map[item] = 1
dt = dwelling_times.copy()
dt.append(0)
dt = pd.Series(dt, index=session_items)
dt = dt / dt.max()
# dt[session_items[-1]] = dt.mean() if len(session_items) > 1 else 1
dt[session_items[-1]] = 1
if self.dwelling_time:
# print(dt)
for i in range(len(dt)):
pos_map[session_items[i]] *= dt.iloc[i]
# print(pos_map)
# print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
items = set(session_items)
neighbors = []
cnt = 0
for session in sessions:
cnt = cnt + 1
# get items of the session, look up the cache first
n_items = self.items_for_session(session)
sts = self.session_time[session]
similarity = self.vec(items, n_items, pos_map)
if similarity > 0:
if self.weighting_time:
diff = timestamp - sts
days = round(diff / 60 / 60 / 24)
decay = pow(7 / 8, days)
similarity *= decay
# print("days:",days," => ",decay)
neighbors.append((session, similarity))
return neighbors
# -----------------
# Find a set of neighbors, returns a list of tuples (sessionid: similarity)
# -----------------
def find_neighbors(self, session_items, input_item_id, session_id, dwelling_times, timestamp):
'''
Finds the k nearest neighbors for the given session_id and the current item input_item_id.
Parameters
--------
session_items: set of item ids
input_item_id: int
session_id: int
Returns
--------
out : list of tuple (session_id, similarity)
'''
possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
possible_neighbors = self.calc_similarity(session_items, possible_neighbors, dwelling_times, timestamp)
possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
possible_neighbors = possible_neighbors[:self.k]
return possible_neighbors
def score_items(self, neighbors, current_session, timestamp):
'''
Compute a set of scores for all items given a set of neighbors.
Parameters
--------
neighbors: set of session ids
Returns
--------
out : list of tuple (item, score)
'''
# now we have the set of relevant items to make predictions
scores = dict()
# iterate over the sessions
for session in neighbors:
# get the items in this session
items = self.items_for_session(session[0])
step = 1
for item in reversed(current_session):
if item in items:
decay = getattr(self, self.weighting_score)(step)
break
step += 1
for item in items:
old_score = scores.get(item)
similarity = session[1]
if old_score is None:
scores.update({item: (similarity * decay)})
else:
new_score = old_score + (similarity * decay)
scores.update({item: new_score})
return scores
def linear_score(self, i):
return 1 - (0.1 * i) if i <= 100 else 0
def same_score(self, i):
return 1
def div_score(self, i):
return 1 / i
def log_score(self, i):
return 1 / (log10(i + 1.7))
def quadratic_score(self, i):
return 1 / (i * i)
def linear(self, i, length):
return 1 - (0.1 * (length - i)) if i <= 10 else 0
def same(self, i, length):
return 1
def div(self, i, length):
return i / length
def log(self, i, length):
return 1 / (log10((length - i) + 1.7))
def quadratic(self, i, length):
return (i / length) ** 2
class SessionKNN:
'''
SessionKNN( k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0, extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time')
Parameters
-----------
k : int
Number of neighboring sessions to calculate the item scores from. (Default value: 100)
sample_size : int
Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
sampling : string
String to define the sampling method for sessions (recent, random). (default: recent)
similarity : string
String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard)
remind : bool
Should the last items of the current session be boosted to the top as reminders
pop_boost : int
Push popular items in the neighbor sessions by this factor. (default: 0 to leave out)
extend : bool
Add evaluated sessions to the maps
normalize : bool
Normalize the scores in the end
session_key : string
Header of the session ID column in the input file. (default: 'SessionId')
item_key : string
Header of the item ID column in the input file. (default: 'ItemId')
time_key : string
Header of the timestamp column in the input file. (default: 'Time')
'''
def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time'):
self.remind = remind
self.k = k
self.sample_size = sample_size
self.sampling = sampling
self.similarity = similarity
self.pop_boost = pop_boost
self.session_key = session_key
self.item_key = item_key
self.time_key = time_key
self.extend = extend
self.normalize = normalize
# updated while recommending
self.session = -1
self.session_items = []
self.relevant_sessions = set()
# cache relations once at startup
self.session_item_map = dict()
self.item_session_map = dict()
self.session_time = dict()
self.sim_time = 0
def fit(self, train):
'''
Trains the predictor.
Parameters
--------
data: pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
'''
index_session = train.columns.get_loc(self.session_key)
index_item = train.columns.get_loc(self.item_key)
index_time = train.columns.get_loc(self.time_key)
self.itemids = train[self.item_key].unique()
session = -1
session_items = set()
time = -1
# cnt = 0
for row in train.itertuples(index=False):
# cache items of sessions
if row[index_session] != session:
if len(session_items) > 0:
self.session_item_map.update({session: session_items})
# cache the last time stamp of the session
self.session_time.update({session: time})
session = row[index_session]
session_items = set()
time = row[index_time]
session_items.add(row[index_item])
# cache sessions involving an item
map_is = self.item_session_map.get(row[index_item])
if map_is is None:
map_is = set()
self.item_session_map.update({row[index_item]: map_is})
map_is.add(row[index_session])
# Add the last tuple
self.session_item_map.update({session: session_items})
self.session_time.update({session: time})
def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
'''
Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
Parameters
--------
session_id : int or string
The session IDs of the event.
input_item_id : int or string
The item ID of the event. Must be in the set of item IDs of the training set.
predict_for_item_ids : 1D array
IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
Returns
--------
out : pandas.Series
Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
'''
# gc.collect()
# process = psutil.Process(os.getpid())
# print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
if (self.session != session_id): # new session
if (self.extend):
item_set = set(self.session_items)
self.session_item_map[self.session] = item_set;
for item in item_set:
map_is = self.item_session_map.get(item)
if map_is is None:
map_is = set()
self.item_session_map.update({item: map_is})
map_is.add(self.session)
ts = time.time()
self.session_time.update({self.session: ts})
self.session = session_id
self.session_items = list()
self.relevant_sessions = set()
if type == 'view':
self.session_items.append(input_item_id)
if skip:
return
neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id)
scores = self.score_items(neighbors)
# add some reminders
if self.remind:
reminderScore = 5
takeLastN = 3
cnt = 0
for elem in self.session_items[-takeLastN:]:
cnt = cnt + 1
# reminderScore = reminderScore + (cnt/100)
oldScore = scores.get(elem)
newScore = 0
if oldScore is None:
newScore = reminderScore
else:
newScore = oldScore + reminderScore
# print 'old score ', oldScore
# update the score and add a small number for the position
newScore = (newScore * reminderScore) + (cnt / 100)
scores.update({elem: newScore})
# push popular ones
if self.pop_boost > 0:
pop = self.item_pop(neighbors)
# Iterate over the item neighbors
# print itemScores
for key in scores:
item_pop = pop.get(key)
# Gives some minimal MRR boost?
scores.update({key: (scores[key] + (self.pop_boost * item_pop))})
# Create things in the format ..
if predict_for_item_ids is None:
predict_for_item_ids = self.itemids
predictions = np.zeros(len(predict_for_item_ids))
mask = np.in1d(predict_for_item_ids, list(scores.keys()))
items = predict_for_item_ids[mask]
values = [scores[x] for x in items]
predictions[mask] = values
series = pd.Series(data=predictions, index=predict_for_item_ids)
if self.normalize:
series = series / series.max()
return series
def item_pop(self, sessions):
'''
Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
Parameters
--------
sessions: set
Returns
--------
out : dict
'''
result = dict()
max_pop = 0
for session, weight in sessions:
items = self.items_for_session(session)
for item in items:
count = result.get(item)
if count is None:
result.update({item: 1})
else:
result.update({item: count + 1})
if (result.get(item) > max_pop):
max_pop = result.get(item)
for key in result:
result.update({key: (result[key] / max_pop)})
return result
def jaccard(self, first, second):
'''
Calculates the jaccard index for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
sc = time.perf_counter()
intersection = len(first & second)
union = len(first | second)
res = intersection / union
self.sim_time += (time.perf_counter() - sc)
return res
def cosine(self, first, second):
'''
Calculates the cosine similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (sqrt(la) * sqrt(lb))
return result
def tanimoto(self, first, second):
'''
Calculates the cosine tanimoto similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (la + lb - li)
return result
def binary(self, first, second):
'''
Calculates a binary overlap coefficient for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
a = len(first & second)
b = len(first)
c = len(second)
result = (2 * a) / ((2 * a) + b + c)
return result
def random(self, first, second):
'''
Returns a random similarity score for two sessions (baseline)
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
return random.random()
def items_for_session(self, session):
'''
Returns all items in the session
Parameters
--------
session: Id of a session
Returns
--------
out : set
'''
return self.session_item_map.get(session);
def sessions_for_item(self, item_id):
'''
Returns all sessions for an item
Parameters
--------
item: Id of the item session
Returns
--------
out : set
'''
return self.item_session_map.get(item_id)
def most_recent_sessions(self, sessions, number):
'''
Find the most recent sessions in the given set
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
sample = set()
tuples = list()
for session in sessions:
time = self.session_time.get(session)
if time is None:
print(' EMPTY TIMESTAMP!! ', session)
tuples.append((session, time))
tuples = sorted(tuples, key=itemgetter(1), reverse=True)
# print 'sorted list ', sortedList
cnt = 0
for element in tuples:
cnt = cnt + 1
if cnt > number:
break
sample.add(element[0])
# print 'returning sample of size ', len(sample)
return sample
def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
'''
Find a set of sessions to later find neighbors in.
A self.sample_size of 0 uses all sessions in which any item of the current session appears.
self.sampling can be performed with the options "recent" or "random".
"recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
if self.sample_size == 0: # use all session as possible neighbors
print('!!!!! running KNN without a sample size (check config)')
return self.relevant_sessions
else: # sample some sessions
self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
if len(self.relevant_sessions) > self.sample_size:
if self.sampling == 'recent':
sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
elif self.sampling == 'random':
sample = random.sample(self.relevant_sessions, self.sample_size)
else:
sample = self.relevant_sessions[:self.sample_size]
return sample
else:
return self.relevant_sessions
def calc_similarity(self, session_items, sessions):
'''
Calculates the configured similarity for the items in session_items and each session in sessions.
Parameters
--------
session_items: set of item ids
sessions: list of session ids
Returns
--------
out : list of tuple (session_id,similarity)
'''
# print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
neighbors = []
cnt = 0
for session in sessions:
cnt = cnt + 1
# get items of the session, look up the cache first
session_items_test = self.items_for_session(session)
similarity = getattr(self, self.similarity)(session_items_test, session_items)
if similarity > 0:
neighbors.append((session, similarity))
return neighbors
# -----------------
# Find a set of neighbors, returns a list of tuples (sessionid: similarity)
# -----------------
def find_neighbors(self, session_items, input_item_id, session_id):
'''
Finds the k nearest neighbors for the given session_id and the current item input_item_id.
Parameters
--------
session_items: set of item ids
input_item_id: int
session_id: int
Returns
--------
out : list of tuple (session_id, similarity)
'''
possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
possible_neighbors = self.calc_similarity(session_items, possible_neighbors)
possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
possible_neighbors = possible_neighbors[:self.k]
return possible_neighbors
def score_items(self, neighbors):
'''
Compute a set of scores for all items given a set of neighbors.
Parameters
--------
neighbors: set of session ids
Returns
--------
out : list of tuple (item, score)
'''
# now we have the set of relevant items to make predictions
scores = dict()
# iterate over the sessions
for session in neighbors:
# get the items in this session
items = self.items_for_session(session[0])
for item in items:
old_score = scores.get(item)
new_score = session[1]
if old_score is None:
scores.update({item: new_score})
else:
new_score = old_score + new_score
scores.update({item: new_score})
return scores
class SeqSessionKNN:
'''
SeqSessionKNN( k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div', remind=False, pop_boost=0, extend=False, normalize=True, session_key='SessionId', item_key='ItemId', time_key='Time')
Parameters
-----------
k : int
Number of neighboring sessions to calculate the item scores from. (Default value: 100)
sample_size : int
Defines the length of a subset of all training sessions to calculate the nearest neighbors from. (Default value: 1000)
sampling : string
String to define the sampling method for sessions (recent, random). (default: recent)
similarity : string
String to define the method for the similarity calculation (jaccard, cosine, binary, tanimoto). (default: jaccard)
remind : bool
Should the last items of the current session be boosted to the top as reminders
pop_boost : int
Push popular items in the neighbor sessions by this factor. (default: 0 to leave out)
extend : bool
Add evaluated sessions to the maps
normalize : bool
Normalize the scores in the end
session_key : string
Header of the session ID column in the input file. (default: 'SessionId')
item_key : string
Header of the item ID column in the input file. (default: 'ItemId')
time_key : string
Header of the timestamp column in the input file. (default: 'Time')
'''
def __init__(self, k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div', remind=False,
pop_boost=0, extend=False, normalize=True, session_key='SessionId', item_key='ItemId',
time_key='Time'):
self.remind = remind
self.k = k
self.sample_size = sample_size
self.sampling = sampling
self.weighting = weighting
self.similarity = similarity
self.pop_boost = pop_boost
self.session_key = session_key
self.item_key = item_key
self.time_key = time_key
self.extend = extend
self.normalize = normalize
# updated while recommending
self.session = -1
self.session_items = []
self.relevant_sessions = set()
# cache relations once at startup
self.session_item_map = dict()
self.item_session_map = dict()
self.session_time = dict()
self.sim_time = 0
def fit(self, train, items=None):
'''
Trains the predictor.
Parameters
--------
data: pandas.DataFrame
Training data. It contains the transactions of the sessions. It has one column for session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
It must have a header. Column names are arbitrary, but must correspond to the ones you set during the initialization of the network (session_key, item_key, time_key properties).
'''
index_session = train.columns.get_loc(self.session_key)
index_item = train.columns.get_loc(self.item_key)
index_time = train.columns.get_loc(self.time_key)
self.itemids = train[self.item_key].unique()
session = -1
session_items = set()
time = -1
# cnt = 0
for row in train.itertuples(index=False):
# cache items of sessions
if row[index_session] != session:
if len(session_items) > 0:
self.session_item_map.update({session: session_items})
# cache the last time stamp of the session
self.session_time.update({session: time})
session = row[index_session]
session_items = set()
time = row[index_time]
session_items.add(row[index_item])
# cache sessions involving an item
map_is = self.item_session_map.get(row[index_item])
if map_is is None:
map_is = set()
self.item_session_map.update({row[index_item]: map_is})
map_is.add(row[index_session])
# Add the last tuple
self.session_item_map.update({session: session_items})
self.session_time.update({session: time})
def predict_next(self, session_id, input_item_id, predict_for_item_ids=None, skip=False, type='view', timestamp=0):
'''
Gives prediction scores for a selected set of items on how likely they are to be the next item in the session.
Parameters
--------
session_id : int or string
The session IDs of the event.
input_item_id : int or string
The item ID of the event. Must be in the set of item IDs of the training set.
predict_for_item_ids : 1D array
IDs of items for which the network should give prediction scores. Every ID must be in the set of item IDs of the training set.
Returns
--------
out : pandas.Series
Prediction scores for selected items on how likely to be the next item of this session. Indexed by the item IDs.
'''
# gc.collect()
# process = psutil.Process(os.getpid())
# print( 'cknn.predict_next: ', process.memory_info().rss, ' memory used')
if (self.session != session_id): # new session
if (self.extend):
item_set = set(self.session_items)
self.session_item_map[self.session] = item_set
for item in item_set:
map_is = self.item_session_map.get(item)
if map_is is None:
map_is = set()
self.item_session_map.update({item: map_is})
map_is.add(self.session)
ts = time.time()
self.session_time.update({self.session: ts})
self.session = session_id
self.session_items = list()
self.relevant_sessions = set()
if type == 'view':
self.session_items.append(input_item_id)
if skip:
return
neighbors = self.find_neighbors(set(self.session_items), input_item_id, session_id)
scores = self.score_items(neighbors, self.session_items)
# add some reminders
if self.remind:
reminderScore = 5
takeLastN = 3
cnt = 0
for elem in self.session_items[-takeLastN:]:
cnt = cnt + 1
# reminderScore = reminderScore + (cnt/100)
oldScore = scores.get(elem)
newScore = 0
if oldScore is None:
newScore = reminderScore
else:
newScore = oldScore + reminderScore
# print 'old score ', oldScore
# update the score and add a small number for the position
newScore = (newScore * reminderScore) + (cnt / 100)
scores.update({elem: newScore})
# push popular ones
if self.pop_boost > 0:
pop = self.item_pop(neighbors)
# Iterate over the item neighbors
# print itemScores
for key in scores:
item_pop = pop.get(key)
# Gives some minimal MRR boost?
scores.update({key: (scores[key] + (self.pop_boost * item_pop))})
# Create things in the format ..
if predict_for_item_ids is None:
predict_for_item_ids = self.itemids
predictions = np.zeros(len(predict_for_item_ids))
mask = np.in1d(predict_for_item_ids, list(scores.keys()))
items = predict_for_item_ids[mask]
values = [scores[x] for x in items]
predictions[mask] = values
series = pd.Series(data=predictions, index=predict_for_item_ids)
if self.normalize:
series = series / series.max()
return series
def item_pop(self, sessions):
'''
Returns a dict(item,score) of the item popularity for the given list of sessions (only a set of ids)
Parameters
--------
sessions: set
Returns
--------
out : dict
'''
result = dict()
max_pop = 0
for session, weight in sessions:
items = self.items_for_session(session)
for item in items:
count = result.get(item)
if count is None:
result.update({item: 1})
else:
result.update({item: count + 1})
if (result.get(item) > max_pop):
max_pop = result.get(item)
for key in result:
result.update({key: (result[key] / max_pop)})
return result
def jaccard(self, first, second):
'''
Calculates the jaccard index for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
sc = time.perf_counter()
intersection = len(first & second)
union = len(first | second)
res = intersection / union
self.sim_time += (time.perf_counter() - sc)
return res
def cosine(self, first, second):
'''
Calculates the cosine similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (sqrt(la) * sqrt(lb))
return result
def tanimoto(self, first, second):
'''
Calculates the cosine tanimoto similarity for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
li = len(first & second)
la = len(first)
lb = len(second)
result = li / (la + lb - li)
return result
def binary(self, first, second):
'''
Calculates a binary overlap coefficient for two sessions
Parameters
--------
first: Id of a session
second: Id of a session
Returns
--------
out : float value
'''
a = len(first & second)
b = len(first)
c = len(second)
result = (2 * a) / ((2 * a) + b + c)
return result
def items_for_session(self, session):
'''
Returns all items in the session
Parameters
--------
session: Id of a session
Returns
--------
out : set
'''
return self.session_item_map.get(session);
def sessions_for_item(self, item_id):
'''
Returns all sessions for an item
Parameters
--------
item: Id of the item session
Returns
--------
out : set
'''
return self.item_session_map.get(item_id)
def most_recent_sessions(self, sessions, number):
'''
Find the most recent sessions in the given set
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
sample = set()
tuples = list()
for session in sessions:
time = self.session_time.get(session)
if time is None:
print(' EMPTY TIMESTAMP!! ', session)
tuples.append((session, time))
tuples = sorted(tuples, key=itemgetter(1), reverse=True)
# print 'sorted list ', sortedList
cnt = 0
for element in tuples:
cnt = cnt + 1
if cnt > number:
break
sample.add(element[0])
# print 'returning sample of size ', len(sample)
return sample
def possible_neighbor_sessions(self, session_items, input_item_id, session_id):
'''
Find a set of sessions to later find neighbors in.
A self.sample_size of 0 uses all sessions in which any item of the current session appears.
self.sampling can be performed with the options "recent" or "random".
"recent" selects the self.sample_size most recent sessions while "random" just chooses randomly.
Parameters
--------
sessions: set of session ids
Returns
--------
out : set
'''
self.relevant_sessions = self.relevant_sessions | self.sessions_for_item(input_item_id);
if self.sample_size == 0: # use all session as possible neighbors
print('!!!!! running KNN without a sample size (check config)')
return self.relevant_sessions
else: # sample some sessions
if len(self.relevant_sessions) > self.sample_size:
if self.sampling == 'recent':
sample = self.most_recent_sessions(self.relevant_sessions, self.sample_size)
elif self.sampling == 'random':
sample = random.sample(self.relevant_sessions, self.sample_size)
else:
sample = self.relevant_sessions[:self.sample_size]
return sample
else:
return self.relevant_sessions
def calc_similarity(self, session_items, sessions):
'''
Calculates the configured similarity for the items in session_items and each session in sessions.
Parameters
--------
session_items: set of item ids
sessions: list of session ids
Returns
--------
out : list of tuple (session_id,similarity)
'''
# print 'nb of sessions to test ', len(sessionsToTest), ' metric: ', self.metric
neighbors = []
cnt = 0
for session in sessions:
cnt = cnt + 1
# get items of the session, look up the cache first
session_items_test = self.items_for_session(session)
similarity = getattr(self, self.similarity)(session_items_test, session_items)
if similarity > 0:
neighbors.append((session, similarity))
return neighbors
# -----------------
# Find a set of neighbors, returns a list of tuples (sessionid: similarity)
# -----------------
def find_neighbors(self, session_items, input_item_id, session_id):
'''
Finds the k nearest neighbors for the given session_id and the current item input_item_id.
Parameters
--------
session_items: set of item ids
input_item_id: int
session_id: int
Returns
--------
out : list of tuple (session_id, similarity)
'''
possible_neighbors = self.possible_neighbor_sessions(session_items, input_item_id, session_id)
possible_neighbors = self.calc_similarity(session_items, possible_neighbors)
possible_neighbors = sorted(possible_neighbors, reverse=True, key=lambda x: x[1])
possible_neighbors = possible_neighbors[:self.k]
return possible_neighbors
def score_items(self, neighbors, current_session):
'''
Compute a set of scores for all items given a set of neighbors.
Parameters
--------
neighbors: set of session ids
Returns
--------
out : list of tuple (item, score)
'''
# now we have the set of relevant items to make predictions
scores = dict()
# iterate over the sessions
for session in neighbors:
# get the items in this session
items = self.items_for_session(session[0])
step = 1
for item in reversed(current_session):
if item in items:
decay = getattr(self, self.weighting)(step)
break
step += 1
for item in items:
old_score = scores.get(item)
similarity = session[1]
if old_score is None:
scores.update({item: (similarity * decay)})
else:
new_score = old_score + (similarity * decay)
scores.update({item: new_score})
return scores
def linear(self, i):
return 1 - (0.1 * i) if i <= 100 else 0
def same(self, i):
return 1
def div(self, i):
return 1 / i
def log(self, i):
return 1 / (log10(i + 1.7))
def quadratic(self, i):
return 1 / (i * i)
class KNNRecommender(ISeqRecommender):
"""
Interface to ItemKNN and Session-based KNN methods. Based on:
Evaluation of Session-based Recommendation Algorithms, Malte Ludewig and Dietmar Jannach
"""
knn_models = {
'iknn': ItemKNN,
'sknn': SessionKNN,
'v-sknn': VMSessionKNN,
's-sknn': SeqSessionKNN,
'sf-sknn': SeqFilterSessionKNN
}
def __init__(self,
model='sknn',
**init_args):
"""
:param model: One among the following KNN models:
- iknn: ItemKNN, item-to-item KNN based on the *last* item in the session to determine the items to be recommended.
- sknn: SessionKNN, compares the *entire* current session with the past sessions in the training data to
determine the items to be recommended.
- v-sknn: VMSessionKNN, use linearly decayed real-valued vectors to encode the current session,
then compares the current session with the past sessions in the training data using the dot-product
to determine the items to be recommended.
- s-sknn: SeqSessionKNN, this variant also puts more weight on elements that appear later in the session by
using a custom scoring function (see the paper by Ludewig and Jannach).
- sf-sknn: SeqFilterSessionKNN, this variant also puts more weight on elements that appear later in the session
in a more restrictive way by using a custom scoring function (see the paper by Ludewig and Jannach).
:param init_args: The model initialization arguments. See the following initializations or
check `util.knn` for more details on each model:
- iknn: ItemKNN(n_sims=100, lmbd=20, alpha=0.5)
- sknn: SessionKNN(k, sample_size=500, sampling='recent', similarity='jaccard', remind=False, pop_boost=0)
- v-sknn: VMSessionKNN(k, sample_size=1000, sampling='recent', similarity='cosine', weighting='div',
dwelling_time=False, last_n_days=None, last_n_clicks=None, extend=False, weighting_score='div_score',
weighting_time=False, normalize=True)
- s-sknn: SeqSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', weighting='div',
remind=False, pop_boost=0, extend=False, normalize=True)
- sf-sknn: SeqFilterSessionKNN(k, sample_size=1000, sampling='recent', similarity='jaccard', remind=False, pop_boost=0,
extend=False, normalize=True)
"""
super(KNNRecommender, self).__init__()
if model not in self.knn_models:
raise ValueError("Unknown KNN model '{}'. The available ones are: {}".format(
model, list(self.knn_models.keys())
))
self.init_args = init_args
self.init_args.update(dict(session_key='session_id',
item_key='item_id',
time_key='ts'))
self.model = self.knn_models[model](**self.init_args)
self.pseudo_session_id = 0
def __str__(self):
return str(self.model)
def fit(self, train_data):
self.logger.info('Converting training data to GRU4Rec format')
# parse training data to GRU4Rec format
train_data = dataset_to_gru4rec_format(dataset=train_data)
self.logger.info('Training started')
self.model.fit(train_data)
self.logger.info('Training completed')
self.pseudo_session_id = 0
def recommend(self, user_profile, user_id=None):
for item in user_profile:
pred = self.model.predict_next(session_id=self.pseudo_session_id,
input_item_id=item)
# sort items by predicted score
pred.sort_values(0, ascending=False, inplace=True)
# increase the pseudo-session id so that future calls to recommend() won't be connected
self.pseudo_session_id += 1
# convert to the required output format
return [([x.index], x._2) for x in pred.reset_index().itertuples()]
knnrecommender = KNNRecommender(model='sknn', k=10)
knnrecommender.fit(train_data)
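As a quick sanity check (an illustrative sketch, not part of the original tutorial code), we can query the fitted model with a short, hypothetical user profile made of item ids seen during training; recommend() returns a list of ([item], score) pairs, so we simply strip the scores to get a ranked item list. The sketch assumes train_data keeps the same sequence column used elsewhere in the notebook.
# Illustrative only: sample_profile is assumed to contain item ids present in train_data.
sample_profile = list(train_data['sequence'].iloc[0][:2])  # first two items of one training sequence
recommendations = knnrecommender.recommend(user_profile=sample_profile)
top5_items = [items for items, score in recommendations[:5]]  # keep the 5 highest-scored items
print(top5_items)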
In the evaluation of sequence-aware recommenders, each sequence in the test set is split into two parts:
- the user profile, used to compute recommendations, is composed of the first k events in the sequence;
- the ground truth, used for performance evaluation, is composed of the remainder of the sequence.
In the cells below, you can control the size of the user profile by assigning a positive value to GIVEN_K, which corresponds to the number of events from the beginning of the sequence that will be assigned to the initial user profile. This ensures that every user profile in the test set has exactly the same initial size, while the size of the ground truth changes from sequence to sequence.
Alternatively, by assigning a negative value to GIVEN_K, you fix the initial size of the ground truth. In this way the ground truth has the same size for all sequences, but the size of the user profile will differ.
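To make the splitting rule concrete, here is a tiny illustrative sketch (the helper name profile_and_ground_truth is ours, not part of the tutorial code) of how a positive or negative GIVEN_K carves a test sequence into user profile and ground truth:
def profile_and_ground_truth(sequence, given_k):
    # positive given_k fixes the profile size; negative given_k fixes the ground-truth size
    return sequence[:given_k], sequence[given_k:]

seq = ['a', 'b', 'c', 'd', 'e']
print(profile_and_ground_truth(seq, 2))   # (['a', 'b'], ['c', 'd', 'e'])  -> profile of fixed size 2
print(profile_and_ground_truth(seq, -2))  # (['a', 'b', 'c'], ['d', 'e'])  -> ground truth of fixed size 2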
def precision(ground_truth, prediction):
"""
Compute Precision metric
:param ground_truth: the ground truth set or sequence
:param prediction: the predicted set or sequence
:return: the value of the metric
"""
ground_truth = remove_duplicates(ground_truth)
prediction = remove_duplicates(prediction)
precision_score = count_a_in_b_unique(prediction, ground_truth) / float(len(prediction))
assert 0 <= precision_score <= 1
return precision_score
def recall(ground_truth, prediction):
"""
Compute Recall metric
:param ground_truth: the ground truth set or sequence
:param prediction: the predicted set or sequence
:return: the value of the metric
"""
ground_truth = remove_duplicates(ground_truth)
prediction = remove_duplicates(prediction)
recall_score = 0 if len(prediction) == 0 else count_a_in_b_unique(prediction, ground_truth) / float(
len(ground_truth))
assert 0 <= recall_score <= 1
return recall_score
def mrr(ground_truth, prediction):
"""
Compute Mean Reciprocal Rank metric. Reciprocal Rank is set to 0 if no predicted item is contained in the ground truth.
:param ground_truth: the ground truth set or sequence
:param prediction: the predicted set or sequence
:return: the value of the metric
"""
rr = 0.
for rank, p in enumerate(prediction):
if p in ground_truth:
rr = 1. / (rank + 1)
break
return rr
def count_a_in_b_unique(a, b):
"""
:param a: list of lists
:param b: list of lists
:return: number of elements of a in b
"""
count = 0
for el in a:
if el in b:
count += 1
return count
def remove_duplicates(l):
return [list(x) for x in set(tuple(x) for x in l)]
METRICS = {'precision':precision,
'recall':recall,
'mrr': mrr}
TOPN=100 # length of the recommendation list
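For illustration only, here is how these metrics behave on a toy example (each element is wrapped in a list to match the format produced by the recommenders above):
toy_ground_truth = [[1], [2], [3]]
toy_prediction = [[9], [2], [1], [8]]
print(precision(toy_ground_truth, toy_prediction))  # 2 of the 4 predicted items are relevant -> 0.5
print(recall(toy_ground_truth, toy_prediction))     # 2 of the 3 ground-truth items are retrieved -> 0.666...
print(mrr(toy_ground_truth, toy_prediction))        # first relevant item appears at rank 2 -> 0.5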
Here we evaluate the quality of the recommendations in a setting in which user profiles are revealed sequentially.
The user profile starts from the first GIVEN_K events (or, alternatively, from the last -GIVEN_K events if GIVEN_K < 0).
The recommendations are evaluated against the next LOOK_AHEAD events (the ground truth).
The user profile is then expanded by the next STEP events, the ground truth is scrolled forward accordingly, and the evaluation continues until the sequence ends.
In typical next-item recommendation, we start with GIVEN_K=1, generate a set of alternatives that will be evaluated against the next event in the sequence (LOOK_AHEAD=1), move forward by one step (STEP=1) and repeat until the sequence ends.
You can set LOOK_AHEAD='all' to see what happens if you had to recommend a whole sequence instead of a set of alternatives to a user.
NOTE: Metrics are averaged over each sequence first, then averaged over all test sequences.
GIVEN_K=1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation
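The sequential_evaluation helper used below comes from the tutorial utilities; the function sketched here is our own simplified, hypothetical version for a single test sequence (positive GIVEN_K only, ignoring user ids) and only illustrates the scrolling logic described above:
def sequential_eval_one_sequence_sketch(recommender, sequence, given_k=1, look_ahead=1, step=1, top_n=TOPN):
    """Scroll one test sequence and average each metric over all evaluated profile lengths."""
    assert given_k > 0, "this sketch only covers positive given_k"
    metric_sums = {name: 0.0 for name in METRICS}
    n_evals = 0
    pos = given_k
    while pos + look_ahead <= len(sequence):
        user_profile = sequence[:pos]                                       # revealed prefix
        ground_truth = [[item] for item in sequence[pos:pos + look_ahead]]  # next LOOK_AHEAD events
        recs = recommender.recommend(user_profile)[:top_n]
        prediction = [items for items, score in recs]
        for name, metric in METRICS.items():
            metric_sums[name] += metric(ground_truth, prediction)
        n_evals += 1
        pos += step                                                         # scroll forward
    return {name: total / n_evals for name, total in metric_sums.items()} if n_evals else {}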
def eval_seqreveal(recommender, user_flg=0):
GIVEN_K = 1
LOOK_AHEAD = 1
STEP=1
if user_flg:
test_sequences, test_users = get_test_sequences_and_users(test_data, GIVEN_K, train_data['user_id'].values) # we need user ids now!
print('{} sequences available for evaluation ({} users)'.format(len(test_sequences), len(np.unique(test_users))))
results = sequential_evaluation(recommender,
test_sequences=test_sequences,
users=test_users,
given_k=GIVEN_K,
look_ahead=LOOK_AHEAD,
evaluation_functions=METRICS.values(),
top_n=TOPN,
scroll=True, # scrolling averages metrics over all profile lengths
step=STEP)
else:
test_sequences = get_test_sequences(test_data, GIVEN_K)
print('{} sequences available for evaluation'.format(len(test_sequences)))
results = sequential_evaluation(recommender,
test_sequences=test_sequences,
given_k=GIVEN_K,
look_ahead=LOOK_AHEAD,
evaluation_functions=METRICS.values(),
top_n=TOPN,
scroll=True, # scrolling averages metrics over all profile lengths
step=STEP)
# print('Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})'.format(GIVEN_K, LOOK_AHEAD, STEP))
# for mname, mvalue in zip(METRICS.keys(), results):
# print('\t{}@{}: {:.4f}'.format(mname, TOPN, mvalue))
return [results, GIVEN_K, LOOK_AHEAD, STEP]
for model in [poprecommender, fsmrecommender, mmcrecommender, p2vrecommender,
rnnrecommender, fpmcrecommender, prnnrecommender,
]:
if model in [fpmcrecommender, prnnrecommender]:
results = eval_seqreveal(model, user_flg=1)
# results = eval_staticprofile(model, user_flg=1)
else:
results = eval_seqreveal(model)
wandb.init(name='seqreveal-'+type(model).__name__,
project='SARS Music30 x1',
notes='sequentially revealed user profile evaluation',
tags=['sequence', 'music', 'seqreveal'])
wandb.log({
"Model": type(model).__name__,
"GIVEN_K": results[1],
"LOOK_AHEAD": results[2],
"STEP": results[3],
"Precision@100": results[0][0],
"Recall@100": results[0][1],
"MRR@100": results[0][2],
})
Here we evaluate the quality of the recommendations in a setting in which user profiles are instead static.
The user profile starts from the first GIVEN_K events (or, alternatively, from the last -GIVEN_K events if GIVEN_K<0). The recommendations are evaluated against the next LOOK_AHEAD events (the ground truth).
The user profile is not extended and the ground truth does not move forward. This allows us to obtain "snapshots" of the recommendation performance for different user profile and ground truth lengths.
Also here you can set LOOK_AHEAD='all' to see what happens if you had to recommend the whole remaining sequence instead of a set of alternatives.
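For contrast with the scrolled setting above, the toy snippet below shows what a static profile with LOOK_AHEAD='all' corresponds to: the profile is frozen at the first GIVEN_K events and the ground truth is simply the rest of the sequence (illustration only, made-up item ids).
# illustration only: static profile, whole remainder of the sequence as ground truth
toy_sequence = [10, 27, 5, 33, 8]
given_k = 1

profile = toy_sequence[:given_k]        # fixed user profile (first GIVEN_K events)
ground_truth = toy_sequence[given_k:]   # LOOK_AHEAD='all': everything after the profile
print('profile: {} -> ground truth: {}'.format(profile, ground_truth))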
def eval_staticprofile(recommender, user_flg=0):
GIVEN_K = 1
LOOK_AHEAD = 'all'
STEP=1
if user_flg:
test_sequences, test_users = get_test_sequences_and_users(test_data, GIVEN_K, train_data['user_id'].values) # we need user ids now!
print('{} sequences available for evaluation ({} users)'.format(len(test_sequences), len(np.unique(test_users))))
results = sequential_evaluation(recommender,
test_sequences=test_sequences,
users=test_users,
given_k=GIVEN_K,
look_ahead=LOOK_AHEAD,
evaluation_functions=METRICS.values(),
top_n=TOPN,
scroll=False # notice that scrolling is disabled!
)
else:
test_sequences = get_test_sequences(test_data, GIVEN_K)
print('{} sequences available for evaluation'.format(len(test_sequences)))
results = sequential_evaluation(recommender,
test_sequences=test_sequences,
given_k=GIVEN_K,
look_ahead=LOOK_AHEAD,
evaluation_functions=METRICS.values(),
top_n=TOPN,
scroll=False # notice that scrolling is disabled!
)
return [results, GIVEN_K, LOOK_AHEAD, STEP]
for model in [poprecommender, fsmrecommender, mmcrecommender, p2vrecommender,
rnnrecommender, fpmcrecommender, prnnrecommender,
]:
if model in [fpmcrecommender, prnnrecommender]:
results = eval_staticprofile(model, user_flg=1)
else:
results = eval_staticprofile(model)
wandb.init(name='staticprofile-'+type(model).__name__,
project='SARS Music30 x1',
notes='static user profile evaluation',
tags=['sequence', 'music', 'staticprofile'])
wandb.log({
"Model": type(model).__name__,
"GIVEN_K": results[1],
"LOOK_AHEAD": results[2],
"STEP": results[3],
"Precision@100": results[0][0],
"Recall@100": results[0][1],
"MRR@100": results[0][2],
})
Analysis of next-item recommendation
Here we propose to analyse the performance of the recommender system in the scenario of next-item recommendation over the following dimensions:
- the length of the recommendation list, and
- the length of the user profile.
NOTE: This evaluation is by no means exhaustive, as the hyper-parameters of each recommendation algorithm should be carefully tuned before drawing any conclusions. Unfortunately, given the time constraints for this tutorial, we had to leave hyper-parameter tuning out. A very useful reference on the careful evaluation of (session-based) recommenders is:
- Evaluation of Session-based Recommendation Algorithms, Ludewig and Jannach, 2018 (paper)
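The loop below calls an eval_reclength helper that does not appear in this section. The following is only a minimal sketch of it, under the assumption that it mirrors the eval_profilelength helper defined further below but sweeps the recommendation list length (top_n) instead of the profile length; the topn_list grid is an assumption.
def eval_reclength(recommender, user_flg=0):
    # assumed counterpart of eval_profilelength: vary the list length, keep the profile length fixed
    topn_list = [1, 5, 10, 20, 50, 100]  # assumed grid of recommendation list lengths
    GIVEN_K = 1
    LOOK_AHEAD = 1
    STEP = 1
    res_list = []
    if user_flg:
        test_sequences, test_users = get_test_sequences_and_users(test_data, GIVEN_K, train_data['user_id'].values)  # we need user ids now!
        print('{} sequences available for evaluation ({} users)'.format(len(test_sequences), len(np.unique(test_users))))
    else:
        test_sequences, test_users = get_test_sequences(test_data, GIVEN_K), None
        print('{} sequences available for evaluation'.format(len(test_sequences)))
    for topn in topn_list:
        print('Evaluating recommendation lists with length: {}'.format(topn))
        res_tmp = sequential_evaluation(recommender,
                                        test_sequences=test_sequences,
                                        users=test_users,  # assumes sequential_evaluation accepts users=None
                                        given_k=GIVEN_K,
                                        look_ahead=LOOK_AHEAD,
                                        evaluation_functions=METRICS.values(),
                                        top_n=topn,
                                        scroll=True,  # average over all profile lengths, as in eval_seqreveal
                                        step=STEP)
        res_list.append((topn, list(zip(METRICS.keys(), res_tmp))))
    # one plot per metric: metric value vs. recommendation list length
    res_list_t = list(zip(*res_list))
    results = []
    for midx, metric in enumerate(METRICS):
        mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]
        fig, ax = plt.subplots(figsize=(5, 5))
        ax.plot(topn_list, mvalues)
        ax.set_title(metric)
        ax.set_xticks(topn_list)
        ax.set_xlabel('Recommendation list length')
        fig.tight_layout()
        results.append(fig)
    return [results, GIVEN_K, LOOK_AHEAD, STEP]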
for model in [poprecommender, fsmrecommender, mmcrecommender, p2vrecommender,
rnnrecommender, fpmcrecommender, prnnrecommender,
]:
if model in [fpmcrecommender, prnnrecommender]:
results = eval_reclength(model, user_flg=1)
else:
results = eval_reclength(model)
wandb.init(name='plotreclen-'+type(model).__name__,
project='SARS Music30 x1',
notes='rec list length variation evaluation',
tags=['sequence', 'music', 'plotreclen'])
wandb.log({"Precision": results[0][0],
"Recall": results[0][1],
"MRR": results[0][2],
"Model": type(model).__name__,
"GIVEN_K": results[1],
"LOOK_AHEAD": results[2],
"STEP": results[3],
})
def eval_profilelength(recommender, user_flg=0):
given_k_list = [1, 2, 3, 4]
LOOK_AHEAD = 1
STEP = 1
TOPN = 20
res_list = []
if user_flg:
test_sequences, test_users = get_test_sequences_and_users(test_data, max(given_k_list), train_data['user_id'].values) # we need user ids now!
print('{} sequences available for evaluation ({} users)'.format(len(test_sequences), len(np.unique(test_users))))
for gk in given_k_list:
print('Evaluating profiles having length: {}'.format(gk))
res_tmp = sequential_evaluation(recommender,
test_sequences=test_sequences,
users=test_users,
given_k=gk,
look_ahead=LOOK_AHEAD,
evaluation_functions=METRICS.values(),
top_n=TOPN,
scroll=False, # here we stop at each profile length
step=STEP)
mvalues = list(zip(METRICS.keys(), res_tmp))
res_list.append((gk, mvalues))
else:
test_sequences = get_test_sequences(test_data, max(given_k_list))
print('{} sequences available for evaluation'.format(len(test_sequences)))
for gk in given_k_list:
print('Evaluating profiles having length: {}'.format(gk))
res_tmp = sequential_evaluation(recommender,
test_sequences=test_sequences,
given_k=gk,
look_ahead=LOOK_AHEAD,
evaluation_functions=METRICS.values(),
top_n=TOPN,
scroll=False, # here we stop at each profile length
step=STEP)
mvalues = list(zip(METRICS.keys(), res_tmp))
res_list.append((gk, mvalues))
# show separate plots per metric
# fig, axes = plt.subplots(nrows=1, ncols=len(METRICS), figsize=(15,5))
res_list_t = list(zip(*res_list))
results = []
for midx, metric in enumerate(METRICS):
mvalues = [res_list_t[1][j][midx][1] for j in range(len(res_list_t[1]))]
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(given_k_list, mvalues)
ax.set_title(metric)
ax.set_xticks(given_k_list)
ax.set_xlabel('Profile length')
fig.tight_layout()
results.append(fig)
return [results, TOPN, LOOK_AHEAD, STEP]
for model in [poprecommender, fsmrecommender, mmcrecommender, p2vrecommender,
rnnrecommender, fpmcrecommender, prnnrecommender,
]:
if model in [fpmcrecommender, prnnrecommender]:
results = eval_profilelength(model, user_flg=1)
else:
results = eval_profilelength(model)
wandb.init(name='plotproflen-'+type(model).__name__,
project='SARS Music30 x1',
notes='profile length variation evaluation',
tags=['sequence', 'music', 'plotproflen'])
wandb.log({"Precision": results[0][0],
"Recall": results[0][1],
"MRR": results[0][2],
"Model": type(model).__name__,
"TOP_N": results[1],
"LOOK_AHEAD": results[2],
"STEP": results[3],
})
import pickle
run = wandb.init(job_type="model-logging",
name="artifact-model",
project='SARS Music30 x1')
for model in [poprecommender, fsmrecommender, mmcrecommender, p2vrecommender,
rnnrecommender, fpmcrecommender, prnnrecommender, knnrecommender]:
    # serialize each fitted model to disk so the file exists before it is attached to the artifact
    with open(type(model).__name__ + '.p', 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    artifact = wandb.Artifact(type(model).__name__, type='model')
    artifact.add_file(type(model).__name__ + '.p')
run.log_artifact(artifact)
run = wandb.init(job_type="data-logging",
name="artifact-data",
project='SARS Music30 x1')
artifact = wandb.Artifact('datasets', type='dataset')
train_data.name = 'train_dataset'
test_data.name = 'test_dataset'
for dataset in [train_data, test_data]:
    # save each split as CSV (with a matching .csv extension) and attach it to the dataset artifact
    dataset.to_csv(dataset.name + '.csv', index=False)
    artifact.add_file(dataset.name + '.csv')
run.log_artifact(artifact)
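The logged artifacts can then be pulled into a later run with use_artifact. The snippet below is only a sketch: it assumes the artifact names follow type(model).__name__ as in the logging loop above, and 'PopularityRecommender' is used here purely as an example name.
# sketch: pull a previously logged model artifact back down in a later run
import os
import pickle

run = wandb.init(job_type="model-consumption", project='SARS Music30 x1')
# 'PopularityRecommender' is an example; the loop above names artifacts after type(model).__name__
artifact = run.use_artifact('PopularityRecommender:latest', type='model')
model_dir = artifact.download()   # copies the artifact files into a local directory
with open(os.path.join(model_dir, 'PopularityRecommender.p'), 'rb') as handle:
    restored_model = pickle.load(handle)
run.finish()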