#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2021/8/6 15:44
# @Author  : 程婷婷
# @FileName: cv_tfidf.py
# @Software: PyCharm
# coding:utf-8
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from model.base.views.utils import *


def cv_tfidf(corpus):
    """Compute the TF-IDF weights of every vocabulary term in a corpus.

    The corpus is first converted to a term-count matrix with
    ``CountVectorizer`` and then re-weighted with ``TfidfTransformer``
    (both with their default settings).

    Args:
        corpus: Iterable of documents, in whatever form
            ``CountVectorizer.fit_transform`` accepts.

    Returns:
        tuple: ``(word, weight)`` where ``word`` is the list of vocabulary
        terms in column order and ``weight`` is the dense TF-IDF array;
        ``weight[i][j]`` is the TF-IDF weight of ``word[j]`` in document
        ``i``.
    """
    vectorizer = CountVectorizer()  # turns documents into a term-count matrix
    transformer = TfidfTransformer()  # re-weights the counts with TF-IDF
    counts = vectorizer.fit_transform(corpus)
    tfidf = transformer.fit_transform(counts)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on old releases. The
    # replacement returns an ndarray, so convert it to keep returning a list.
    if hasattr(vectorizer, "get_feature_names_out"):
        word = vectorizer.get_feature_names_out().tolist()
    else:
        word = vectorizer.get_feature_names()

    weight = tfidf.toarray()  # densify the sparse TF-IDF matrix
    return word, weight

def get_word_tf_frequency(word, weight, word2count, top_k=3):
    """Collect the highest-weighted terms of each document that have a count.

    For every row of ``weight`` the terms are paired with that row's
    weights, ordered descending with the ``takeSecond`` key, and cut to the
    first ``top_k`` pairs. Those pairs are then re-ordered descending with
    the ``takeFirst_len`` key, and every term present in ``word2count`` is
    appended to the result lists.

    NOTE(review): ``takeSecond`` and ``takeFirst_len`` come from
    ``model.base.views.utils``; presumably they select the weight and the
    term length respectively -- confirm against that module.

    Args:
        word: Sequence of terms, aligned with the columns of ``weight``.
        weight: Iterable of rows; each row holds one weight per term.
        word2count: Mapping from term to the value stored in the returned
            ``word_weight`` list. Terms missing from it are skipped.
        top_k: Number of leading pairs kept per row. Defaults to 3, the
            previously hard-coded value; expected to be non-negative.

    Returns:
        tuple: ``(word_tf, keyword, word_weight)`` -- three parallel lists
        holding, for each kept term, its weight from ``weight``, the term
        itself, and its ``word2count`` value.
    """
    word_tf, keyword, word_weight = [], [], []
    for row in weight:
        # Rank this row's (term, weight) pairs, then keep the leading ones.
        ranked = sorted(zip(word, row), key=takeSecond, reverse=True)
        selected = sorted(ranked[:top_k], key=takeFirst_len, reverse=True)
        for term, score in selected:
            if term not in word2count:
                continue
            word_weight.append(word2count[term])
            keyword.append(term)
            word_tf.append(score)
    return word_tf, keyword, word_weight
