from .content_based_algo import ContentBasedAlgo
from .base_algo import BaseAlgo
from ..preprocess import PreProcessor
from gensim import models
import pandas as pd
import numpy as np
import pickle as pic
class InitialParams:
    """
    Container for the data required to initialize :class:`TopicBasedAlgo`:
    the item ids, the gensim dictionary (``id2word``) and the bag-of-words
    corpus produced by preprocessing.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: must supply ``ids``, ``id2word`` and ``corpus``.
        :raises KeyError: if any of the three required keys is missing.
        """
        self.ids = kwargs['ids']
        self.id2word = kwargs['id2word']
        self.corpus = kwargs['corpus']

    def save(self, fname):
        """
        Persist the initial params to a file via :mod:`pickle`.

        :param fname: file path
        """
        with open(fname, 'wb') as f:
            pic.dump({"ids": self.ids, "id2word": self.id2word, "corpus": self.corpus}, f)

    @classmethod
    def load(cls, fname):
        """
        Load an object previously saved by :meth:`save` from a file.

        :param fname: file path
        :return: object loaded from file
        """
        with open(fname, 'rb') as f:
            obj = pic.load(f)
        # Use ``cls`` (not a hard-coded class name) so subclasses load
        # as instances of their own type.
        return cls(ids=obj['ids'], id2word=obj['id2word'], corpus=obj['corpus'])
class TopicBasedAlgo(BaseAlgo):
    """
    Content-based algorithm which uses "topic model" algorithms (LSI or LDA)
    to build item vectors, then delegates training and recommendation to
    :class:`ContentBasedAlgo`.
    """

    def __init__(self, initial_params, topic_n=100, chunksize=100, topic_type='lda', power_iters=2,
                 extra_samples=100, passes=1):
        """
        :param initial_params: An instance of InitialParams generated by :meth:`preprocess \
        <topic_based_algo.TopicBasedAlgo.preprocess>`
        :param topic_n: The number of requested latent topics to be extracted from the training corpus.
        :param chunksize: Number of documents to be used in each training chunk.
        :param topic_type: ``'lsi'`` or ``'lda'``.
        :param power_iters: (**LSI parameter**) Number of power iteration steps to be used. Increasing the number \
        of power iterations improves accuracy, but lowers performance.
        :param extra_samples: (**LSI parameter**) Extra samples to be used besides the rank k. Can improve accuracy.
        :param passes: (**LDA parameter**) Number of passes through the corpus during training.
        :raises ValueError: if ``topic_type`` is neither ``'lsi'`` nor ``'lda'``
            (raised from :meth:`_generate_item_vector`).
        """
        super().__init__()
        self._item_ids = initial_params.ids
        self._id2word = initial_params.id2word
        self._corpus = initial_params.corpus
        self._topic_n = topic_n
        self._topic = topic_type
        self._topic_model = None
        self._chunksize = chunksize
        self._power_iters = power_iters
        self._extra_samples = extra_samples
        self._passes = passes
        # Builds the topic model eagerly and hands its item vectors to the
        # delegated content-based algorithm.
        self._content_algo = ContentBasedAlgo(self._generate_item_vector(), self._topic_n)

    @classmethod
    def preprocess(cls, raw_data):
        """
        Process raw data containing item ids and their content before
        initializing a TopicBasedAlgo instance.

        :param raw_data: A pandas.DataFrame contains item id and content \| id \| content \|
        :return: A :class:`InitialParams` instance, a necessary parameter in the \
        initialization of TopicBasedAlgo.
        """
        id_tokens = PreProcessor.generate_tokens(raw_data)
        tf_res = PreProcessor.build_tf_idf(id_tokens)
        # Normalize column names so the id column can be addressed reliably.
        raw_data.columns = ['id', 'content']
        return InitialParams(ids=raw_data['id'].values.tolist(), id2word=tf_res['gensim_pack']['id2word'],
                             corpus=tf_res['gensim_pack']['corpus'])

    def train(self, train_set):
        """Delegate training to the underlying content-based algorithm.

        :param train_set: training data, passed through unchanged.
        :return: ``self`` to allow call chaining.
        """
        self._content_algo.train(train_set)
        return self

    def top_k_recommend(self, u_id, k):
        """Delegate top-k recommendation to the underlying content-based algorithm.

        :param u_id: user id
        :param k: number of recommendations requested
        :return: whatever :meth:`ContentBasedAlgo.top_k_recommend` returns.
        """
        return self._content_algo.top_k_recommend(u_id, k)

    @classmethod
    def load(cls, fname):
        """
        Load a TopicBasedAlgo previously persisted with :meth:`save`.

        The topic model and the delegated content-based algorithm are stored
        in sibling files (``fname.lsi``/``fname.lda`` and
        ``fname.content_base``) and are re-attached here.

        :param fname: base file path used at save time
        :return: the restored TopicBasedAlgo instance
        :raises AssertionError: if the file does not hold a TopicBasedAlgo.
        """
        res = super(TopicBasedAlgo, cls).load(fname)
        assert (hasattr(res, '_topic')), 'Not a standard TopicBasedAlgo class.'
        topic = getattr(res, '_topic')
        if topic == 'lsi':
            setattr(res, '_topic_model', models.LsiModel.load('.'.join([fname, 'lsi'])))
        elif topic == 'lda':
            setattr(res, '_topic_model', models.LdaModel.load('.'.join([fname, 'lda'])))
        # NOTE(review): an unrecognized topic type silently leaves
        # _topic_model as None; __init__ would have rejected it earlier.
        setattr(res, '_content_algo', ContentBasedAlgo.load('.'.join([fname, 'content_base'])))
        return res

    def save(self, fname, *args):
        """
        Persist this instance. The gensim topic model and the delegated
        content-based algorithm cannot be pickled with the rest of the
        object, so they are saved to sibling files and excluded from the
        base-class pickle via ``ignore``.

        :param fname: base file path
        :param args: accepted for interface compatibility; currently unused.
        """
        ignore = ['_topic_model', '_content_algo']
        if self._topic_model is not None:
            self._topic_model.save('.'.join([fname, self._topic]))
        self._content_algo.save('.'.join([fname, 'content_base']))
        super().save(fname, ignore)

    def _generate_item_vector(self):
        """
        Use the LDA or LSI algorithm to process the TF-IDF corpus and
        generate dense item vectors of length ``self._topic_n``.

        :return: DataFrame with columns ``item_id`` and ``vec``.
        :raises ValueError: if ``self._topic`` is neither ``'lsi'`` nor ``'lda'``.
        """
        if self._topic == 'lsi':
            self._topic_model = models.LsiModel(corpus=self._corpus, num_topics=self._topic_n,
                                                id2word=self._id2word, chunksize=self._chunksize,
                                                power_iters=self._power_iters, extra_samples=self._extra_samples)
        elif self._topic == 'lda':
            self._topic_model = models.LdaModel(corpus=self._corpus, num_topics=self._topic_n,
                                                id2word=self._id2word, chunksize=self._chunksize,
                                                update_every=1, passes=self._passes, dtype=np.float64)
        else:
            raise ValueError(self._topic)
        vecs = self._topic_model[self._corpus]
        pure_vecs = []
        for vec in vecs:
            # gensim returns sparse (index, value) pairs; expand any vector
            # that omits zero-weight topics to the full topic_n length.
            if len(vec) != self._topic_n:
                pure_vecs.append(TopicBasedAlgo._rebuild_vector(vec, self._topic_n))
            else:
                pure_vecs.append([v for (index, v) in vec])
        return pd.DataFrame({'item_id': self._item_ids, 'vec': pure_vecs})

    def to_dict(self):
        """
        See :meth:`BaseAlgo.to_dict <base_algo.BaseAlgo.to_dict>` for more details.
        """
        res = {'type': self._topic, 'topic_num': self._topic_n, 'chunksize': self._chunksize}
        if self._topic == 'lsi':
            res['power_iters'] = self._power_iters
            res['extra_samples'] = self._extra_samples
        else:
            res['passes'] = self._passes
        return res

    @staticmethod
    def _rebuild_vector(partial_vector, dim):
        """Expand a sparse (index, value) vector into a dense list of length *dim*.

        Missing indices are filled with 0.
        """
        res = [0] * dim
        for (index, value) in partial_vector:
            res[index] = value
        return res