# Source code for ucas_dm.preprocess.preprocess
# coding=utf-8
import pandas as pd
import jieba.posseg as pseg
import codecs
import os
from gensim import corpora, models
from ast import literal_eval
class PreProcessor:
    """Pre-process raw news data: tokenization and TF-IDF vectorization."""

def __init__(self, source_data_path):
self.__source_data_path = source_data_path
    @classmethod
def generate_tokens(cls, id_content):
"""
This method generate tokens for news.
:param id_content: A pandas.DataFrame of news id(integer) and its content(string) \
\|column1: news_id\|column2: content\|
:return: A pd.DataFrame of news id and its tokens
"""
        dir_path = os.path.split(__file__)[0]
        stop_words_path = os.path.join(dir_path, "stop_words", "stop.txt")
        id_content.columns = ['news_id', 'content']
        # Close the stop-words file promptly and use a set for O(1) membership tests.
        with codecs.open(stop_words_path, encoding='utf8') as f:
            stop_words = set(w.strip() for w in f)
        # jieba POS tags to filter out: punctuation/others (x), conjunctions (c),
        # auxiliaries (u, uj), adverbs (d), prepositions (p), time words (t),
        # numerals (m), localizers (f) and pronouns (r).
        stop_flags = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
def _tokenization(text):
result = []
words = pseg.cut(text)
for word, flag in words:
if flag not in stop_flags and word not in stop_words:
result.append(word)
return result
        # Tokenize each article; iterating over the column avoids assuming a
        # default RangeIndex on id_content.
        res = [_tokenization(content) for content in id_content['content']]
        assert id_content.shape[0] == len(res), \
            "The number of id_content's rows doesn't match the length of the tokenization result."
id_tokens = pd.DataFrame({'news_id': id_content['news_id'], 'tokens': res})
return id_tokens
    @classmethod
def build_tf_idf(cls, id_tokens):
"""
This method builds TF-IDF vectors for news.
:param id_tokens: A pandas.DataFrame contains news id and its tokens. \|column1: news_id\|column2: tokens\|
:return: A dict - {"id_tfvec": A pandas.DataFrame contains news id and its tf-idf vector \
\|column1: news_id\|column2: tf_vec\| ,"gensim_pack":{"word2dict": important parameter if package "gensim" is \
used for further process, "corpus": important parameter if package "gensim" is used for further process}}
"""
id_tokens.columns = ['news_id', 'tokens']
pure_tokens = id_tokens['tokens'].values.tolist()
        if isinstance(pure_tokens[0], str):
            # Tokens reloaded from disk (e.g. CSV) arrive as list-like strings; parse them back into lists.
            pure_tokens = [literal_eval(t) for t in pure_tokens]
word_dict = corpora.Dictionary(pure_tokens) # Used in LSA or LDA algorithm
news_bow = [word_dict.doc2bow(t) for t in pure_tokens]
algo = models.TfidfModel(news_bow)
corpus_tfidf = algo[news_bow]
        # Strip the word ids, keeping only the tf-idf weights of each document.
        news_vec = [[v for (_, v) in doc] for doc in corpus_tfidf]
id_tfvec = pd.DataFrame({'news_id': id_tokens['news_id'], 'tf_vec': news_vec})
return {"id_tfvec": id_tfvec, "gensim_pack": {"id2word": word_dict, "corpus": corpus_tfidf}}