# Source code for ucas_dm.prediction_algorithms.topic_based_algo

from .content_based_algo import ContentBasedAlgo
from .base_algo import BaseAlgo
from ..preprocess import PreProcessor
from gensim import models
import pandas as pd
import numpy as np
import pickle as pic


class InitialParams:
    """
    Container for the data required to initialize :class:`TopicBasedAlgo`.

    Holds the item ids, the gensim id-to-word dictionary and the bag-of-words
    corpus produced by :meth:`TopicBasedAlgo.preprocess`.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: must supply 'ids', 'id2word' and 'corpus'.
        :raises KeyError: if any of the three required keys is missing.
        """
        # item ids, aligned one-to-one with the corpus documents
        self.ids = kwargs['ids']
        # gensim token-id -> word mapping
        self.id2word = kwargs['id2word']
        # bag-of-words corpus (list of (token_id, weight) sparse vectors)
        self.corpus = kwargs['corpus']

    def save(self, fname):
        """
        Save initial params to a file.

        :param fname: file path
        """
        with open(fname, 'wb') as f:
            pic.dump({"ids": self.ids, "id2word": self.id2word, "corpus": self.corpus}, f)

    @classmethod
    def load(cls, fname):
        """
        Load an object previously saved from a file.

        :param fname: file path
        :return: object loaded from file
        """
        # NOTE: pickle.load must only be used on trusted files.
        with open(fname, 'rb') as f:
            obj = pic.load(f)
        # Use cls (not a hard-coded class name) so subclasses load as themselves.
        return cls(ids=obj['ids'], id2word=obj['id2word'], corpus=obj['corpus'])
class TopicBasedAlgo(BaseAlgo):
    """
    Content-based algorithm which use "Topic model" algorithms (LSI or LDA).
    Use delegation strategy: the topic model turns item content into dense
    vectors, and a :class:`ContentBasedAlgo` instance does the actual
    train / recommend work on those vectors.
    """

    def __init__(self, initial_params, topic_n=100, chunksize=100, topic_type='lda',
                 power_iters=2, extra_samples=100, passes=1):
        """
        :param initial_params: An instance of InitialParams generated by :meth:`preprocess \
<topic_based_algo.TopicBasedAlgo.preprocess>`
        :param topic_n: The number of requested latent topics to be extracted from the training corpus.
        :param chunksize: Number of documents to be used in each training chunk.
        :param topic_type: 'lsi' or 'lda'
        :param power_iters: (**LSI parameter**) Number of power iteration steps to be used. Increasing the \
number of power iterations improves accuracy, but lowers performance.
        :param extra_samples: (**LSI parameter**) Extra samples to be used besides the rank k. Can improve accuracy.
        :param passes: (**LDA parameter**) Number of passes through the corpus during training.
        """
        super().__init__()
        self._item_ids = initial_params.ids
        self._id2word = initial_params.id2word
        self._corpus = initial_params.corpus
        self._topic_n = topic_n
        self._topic = topic_type
        self._topic_model = None
        self._chunksize = chunksize
        self._power_iters = power_iters
        self._extra_samples = extra_samples
        self._passes = passes
        # _generate_item_vector trains the gensim model as a side effect and
        # yields one dense topic vector per item for the delegated algorithm.
        self._content_algo = ContentBasedAlgo(self._generate_item_vector(), self._topic_n)

    @classmethod
    def preprocess(cls, raw_data):
        """
        Call this method to process raw data which contain item id and its content before initializing \
TopicBasedAlgo instance.

        :param raw_data: A pandas.DataFrame contains item id and content \| id \| content \|
        :return: A :meth:`InitialParams <topic_based_algo.InitialParams>` instance, a necessary parameter \
in the initialization of TopicBasedAlgo.
        """
        id_tokens = PreProcessor.generate_tokens(raw_data)
        tf_res = PreProcessor.build_tf_idf(id_tokens)
        # Normalize column names in place so the id column can be read below.
        raw_data.columns = ['id', 'content']
        gensim_pack = tf_res['gensim_pack']
        return InitialParams(ids=raw_data['id'].values.tolist(),
                             id2word=gensim_pack['id2word'],
                             corpus=gensim_pack['corpus'])

    def train(self, train_set):
        """Delegate training to the inner content-based algorithm; return self for chaining."""
        self._content_algo.train(train_set)
        return self

    def top_k_recommend(self, u_id, k):
        """Delegate top-k recommendation to the inner content-based algorithm."""
        return self._content_algo.top_k_recommend(u_id, k)

    @classmethod
    def load(cls, fname):
        """
        Restore an instance saved by :meth:`save`: base attributes from `fname`,
        the gensim model from `fname.<topic>` and the delegated algorithm from
        `fname.content_base`.
        """
        res = super(TopicBasedAlgo, cls).load(fname)
        assert hasattr(res, '_topic'), 'Not a standard TopicBasedAlgo class.'
        topic = getattr(res, '_topic')
        if topic == 'lsi':
            setattr(res, '_topic_model', models.LsiModel.load(f"{fname}.lsi"))
        elif topic == 'lda':
            setattr(res, '_topic_model', models.LdaModel.load(f"{fname}.lda"))
        setattr(res, '_content_algo', ContentBasedAlgo.load(f"{fname}.content_base"))
        return res

    def save(self, fname, *args):
        """
        Persist the instance: gensim model and the delegated algorithm are written
        with their own persistence formats, so they are excluded from the base pickle.
        """
        ignore = ['_topic_model', '_content_algo']
        if self._topic_model is not None:
            self._topic_model.save(f"{fname}.{self._topic}")
        self._content_algo.save(f"{fname}.content_base")
        super().save(fname, ignore)

    def _generate_item_vector(self):
        """
        Use LDA or LSI algorithm to process TF-IDF vector and generate new item vectors.

        :return: DataFrame contains item id and it's new vector
        :raises ValueError: if the configured topic type is neither 'lsi' nor 'lda'.
        """
        if self._topic == 'lsi':
            self._topic_model = models.LsiModel(corpus=self._corpus, num_topics=self._topic_n,
                                                id2word=self._id2word, chunksize=self._chunksize,
                                                power_iters=self._power_iters,
                                                extra_samples=self._extra_samples)
        elif self._topic == 'lda':
            self._topic_model = models.LdaModel(corpus=self._corpus, num_topics=self._topic_n,
                                                id2word=self._id2word, chunksize=self._chunksize,
                                                update_every=1, passes=self._passes,
                                                dtype=np.float64)
        else:
            raise ValueError(self._topic)
        pure_vecs = []
        for sparse_vec in self._topic_model[self._corpus]:
            if len(sparse_vec) == self._topic_n:
                # Already fully populated: strip the indices, keep the values.
                pure_vecs.append([value for _, value in sparse_vec])
            else:
                # Gensim omits near-zero entries; densify to a fixed length.
                pure_vecs.append(TopicBasedAlgo._rebuild_vector(sparse_vec, self._topic_n))
        return pd.DataFrame({'item_id': self._item_ids, 'vec': pure_vecs})

    def to_dict(self):
        """
        See :meth:`BaseAlgo.to_dict <base_algo.BaseAlgo.to_dict>` for more details.
        """
        res = {'type': self._topic, 'topic_num': self._topic_n, 'chunksize': self._chunksize}
        if self._topic == 'lsi':
            res['power_iters'] = self._power_iters
            res['extra_samples'] = self._extra_samples
        else:
            res['passes'] = self._passes
        return res

    @staticmethod
    def _rebuild_vector(partial_vector, dim):
        """Densify a sparse (index, value) list into a length-`dim` list, zeros elsewhere."""
        dense = dict(partial_vector)
        return [dense.get(i, 0) for i in range(dim)]