#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2021/5/11 20:14
# @Author  : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
import re
import os
import jieba
import joblib
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from model.base.views.data.BaseDataProcess import BaseDataProcess
from model.classify.views.logistic_classify.data.LogisticClassifyDataLoader import LogisticClassifyDataLoader


class LogisticClassifyProcess(BaseDataProcess):
    def __init__(self, config_path):
        """Initialise the base processor and attach a data loader.

        :param config_path: configuration path; passed unchanged to the
            parent constructor and to ``LogisticClassifyDataLoader``.
        """
        super().__init__(config_path)
        loader = LogisticClassifyDataLoader(config_path)
        # The other methods of this class reach the loader through ``self.lcdl``.
        self.lcdl = loader

    def document2sentences(self, document, key_words):
        symbols = frozenset(u"，。！？\n：；“”|）\u3000")
        out_sentences = ''
        for symbol in symbols:
            document = document.replace(symbol, '。')
        document = document.replace('\t', '').replace('\n', '')
        sentences = document.split('。')
        for sentence in sentences:
            for key in key_words:
                weight = sentence.count(key)
                sentence += '。'
                out_sentences += sentence * weight
        return out_sentences

    def filtrate_words(self, words):
        find_chinese = re.compile(u"[\u4e00-\u9fa5]+")
        symbols = "[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\！\@\#\\\&\*\%]"
        stopwords = self.lcdl.read_stopwords()
        filtrated_words = []
        for j in range(len(words)):
            if re.findall(find_chinese, words[j]) == []:
                continue
            elif re.sub(symbols, "", re.findall(find_chinese, words[j])[0]) == '':
                continue
            elif re.sub(symbols, "", re.findall(find_chinese, words[j])[0]) in stopwords:
                continue
            else:
                filtrated_words.append(re.sub(symbols, "", re.findall(find_chinese, words[j])[0]))
        return ' '.join(filtrated_words)

    def get_chi(self, data, labels):
        num = len(data)
        length = len(data[0])
        # print(type(labels[0]))
        # print(labels[0])
        print('================')
        print(len(labels))
        print(len(data))
        data_p = [data[i] for i in range(num) if int(labels[i]) == 1]
        data_n = [data[i] for i in range(num) if int(labels[i]) == 0]
        num_p = len(data_p)
        num_n = len(data_n)
        print('正样本为%s', str(num_p))
        print('负样本为%s', str(num_n))

        data_p_t = list(map(list, zip(*data_p)))
        data_n_t = list(map(list, zip(*data_n)))

        chi_square = []
        for i in range(length):
            b = data_p_t[i].count(0)
            d = data_n_t[i].count(0)
            a = num_p - b
            c = num_n - d
            if num_p * num_n * (a + c) * (b + d) == 0:
                chi_square.append(0)
            else:
                chi_square.append((num * pow(a * d - b * c, 2)) / (num_p * num_n * (a + c) * (b + d)))
        return chi_square

    def get_vocabulary_title(self, titles_tokenized_filtered, contents_tokenized_filtered, labels):
        """Select the title-stage vocabulary by chi-square ranking.

        One text per label is built from the title (followed by a space)
        multiplied by ``embedding_config['title_weight']`` and the content.
        1- to 3-grams occurring in at least two texts are counted, scored
        with :meth:`get_chi`, and the best
        ``int(len(vocabulary) * embedding_config['title_feature_ratio'])``
        terms are kept.

        :param titles_tokenized_filtered: space-joined title tokens, one
            string per sample.
        :param contents_tokenized_filtered: space-joined content tokens, one
            string per sample.
        :param labels: labels of the samples; its length determines how many
            samples are used.
        :return: dict mapping each selected term to its rank (0 = highest
            chi-square score).
        """
        # NOTE(review): the title is repeated by multiplying a string, which
        # presumably requires title_weight to be an int -- confirm in config.
        data = [
            self.embedding_config['title_weight'] * (titles_tokenized_filtered[i] + ' ') + contents_tokenized_filtered[i]
            for i in range(len(labels))]
        cv = CountVectorizer(ngram_range=(1, 3), min_df=2)
        tf = cv.fit_transform(data)
        # get_feature_names() no longer exists in current scikit-learn
        # releases; fall back to it only when the replacement is unavailable.
        if hasattr(cv, 'get_feature_names_out'):
            vocabulary_list = cv.get_feature_names_out().tolist()
        else:
            vocabulary_list = cv.get_feature_names()
        print(' | Train |  Title  | Vocabulary | Original Length | ' + str(len(vocabulary_list)))
        num_key_words = int(len(vocabulary_list) * self.embedding_config['title_feature_ratio'])
        print(' | Train |  Title  | Vocabulary |     Length      | ' + str(num_key_words))
        # Densify once and reuse the array for both the print and the scoring.
        dense_tf = tf.toarray()
        print(dense_tf)
        chi_square = self.get_chi(dense_tf.tolist(), labels)
        print(' | Train |  Title  | Vocabulary | Complete by CHI ......')
        # sorted() is stable, so terms with equal scores keep vocabulary order.
        ranked = sorted(zip(vocabulary_list, chi_square), key=lambda pair: pair[1], reverse=True)
        # Slicing tolerates a ratio above 1 (keeps everything) instead of
        # running past the end of the ranking.
        selected = ranked[:max(num_key_words, 0)]
        return {word: rank for rank, (word, _) in enumerate(selected)}

    def get_tfidf_title(self, titles_tokenized_filtered, contents_tokenized_filtered, vocabulary_title):
        """Vectorise the weighted title+content texts as TF-IDF.

        One text is built per entry of ``self.labels``: the title (followed
        by a space) multiplied by ``embedding_config['title_weight']``, then
        the content. Counting is restricted to ``vocabulary_title``.

        :param titles_tokenized_filtered: space-joined title tokens, one
            string per sample.
        :param contents_tokenized_filtered: space-joined content tokens, one
            string per sample.
        :param vocabulary_title: term -> column index mapping handed to
            ``CountVectorizer``.
        :return: ``(weights, idf)`` -- the dense TF-IDF rows as nested lists
            and the fitted idf vector as a list.
        """
        documents = []
        for row in range(len(self.labels)):
            weighted_title = self.embedding_config['title_weight'] * (titles_tokenized_filtered[row] + ' ')
            documents.append(weighted_title + contents_tokenized_filtered[row])

        counter = CountVectorizer(vocabulary=vocabulary_title, ngram_range=(1, 3))
        term_counts = counter.fit_transform(documents)
        print(' | Train |  Title  | TF | Completed ......')

        weigher = TfidfTransformer(smooth_idf=True, use_idf=True, norm='l2')
        dense_rows = weigher.fit_transform(term_counts).toarray().tolist()
        print(' | Train |  Title  | TFIDF | Completed ......')
        return dense_rows, weigher.idf_.tolist()

    def get_vocabulary_content(self, contents_tokenized_filtered, labels, index):
        """Select the content-stage vocabulary by chi-square ranking.

        Only the samples listed in ``index`` are used. 1- to 3-grams occurring
        in at least two of those texts are counted, scored with
        :meth:`get_chi`, and the best
        ``int(len(vocabulary) * embedding_config['content_feature_ratio'])``
        terms are kept. The result is also stored on
        ``self.vocabulary_content``.

        :param contents_tokenized_filtered: space-joined content tokens, one
            string per sample.
        :param labels: labels of all samples, addressable by the entries of
            ``index``.
        :param index: iterable of sample positions to use.
        :return: dict mapping each selected term to its rank (0 = highest
            chi-square score).
        """
        data = [contents_tokenized_filtered[idx] for idx in index]
        labels = [labels[idx] for idx in index]
        tf_transformer = CountVectorizer(ngram_range=(1, 3), min_df=2)
        tf = tf_transformer.fit_transform(data)
        # get_feature_names() no longer exists in current scikit-learn
        # releases; fall back to it only when the replacement is unavailable.
        if hasattr(tf_transformer, 'get_feature_names_out'):
            vocabulary_list = tf_transformer.get_feature_names_out().tolist()
        else:
            vocabulary_list = tf_transformer.get_feature_names()
        print(' | Train | Content | Vocabulary | Original Length | ' + str(len(vocabulary_list)))
        num_key_words = int(len(vocabulary_list) * self.embedding_config['content_feature_ratio'])
        print(' | Train | Content | Vocabulary |     Length      | ' + str(num_key_words))
        tf_weights = tf.toarray().tolist()
        chi_square = self.get_chi(tf_weights, labels)
        print(' | Train | Content | Vocabulary | Complete by CHI ......')
        # sorted() is stable, so terms with equal scores keep vocabulary order.
        ranked = sorted(zip(vocabulary_list, chi_square), key=lambda pair: pair[1], reverse=True)
        # Slicing tolerates a ratio above 1 (keeps everything) instead of
        # running past the end of the ranking.
        selected = ranked[:max(num_key_words, 0)]
        self.vocabulary_content = {word: rank for rank, (word, _) in enumerate(selected)}
        return self.vocabulary_content

    def get_tfidf_content(self, contents_tokenized_filtered, vocabulary_content, index):
        """Vectorise the selected content texts as TF-IDF.

        :param contents_tokenized_filtered: space-joined content tokens, one
            string per sample.
        :param vocabulary_content: term -> column index mapping handed to
            ``CountVectorizer``.
        :param index: iterable of sample positions to vectorise.
        :return: ``(weights, idf)`` -- the dense TF-IDF rows as nested lists
            and the fitted idf vector as a list.
        """
        documents = []
        for position in index:
            documents.append(contents_tokenized_filtered[position])

        counter = CountVectorizer(vocabulary=vocabulary_content, ngram_range=(1, 3))
        term_counts = counter.fit_transform(documents)
        print(' | Train | Content | TF | Completed ......')

        weigher = TfidfTransformer(smooth_idf=True, use_idf=True, norm='l2')
        dense_rows = weigher.fit_transform(term_counts).toarray().tolist()
        print(' | Train | Content | TFIDF | Completed ......')
        return dense_rows, weigher.idf_.tolist()

    def title_process(self, logger):
        """Run the title-stage preprocessing and persist its artefacts.

        Steps: read the data through ``self.lcdl``, register the keywords
        with jieba, drop rows lacking content or label, shuffle, map labels
        to integer ids, split the data, weight the training contents by
        keyword hits, tokenise and filter titles and contents, select the
        vocabulary, compute TF-IDF, and dump the vocabulary and idf vector
        under ``embedding_config['embedding_path']``.

        Sets ``self.label_mapping``, ``self.labels`` and
        ``self.contents_tokenized_filtered``. ``split_dataset``,
        ``process_config`` and ``embedding_config`` are not defined in this
        class; they are presumably provided by ``BaseDataProcess``.

        :param logger: object with an ``info(message)`` method.
        :return: ``(tfidf_title, idf_title, labels)`` for the training set.
        """
        df = self.lcdl.read_file()
        key_words = []
        for word in set(df['key_words']):
            # Keep the keyword as text: jieba receives text and the sentence
            # weighting counts it with str.count, which rejects non-strings.
            word = str(word)
            # str(NaN) == 'nan': missing cells must not become keywords.
            if word and word != 'nan':
                key_words.append(word)
                jieba.add_word(word)
        df.dropna(subset=['content', 'label'], inplace=True)
        df = shuffle(df)
        df = df.reset_index(drop=True)
        # Sort the distinct labels so the label -> id mapping does not depend
        # on set iteration order (which varies between runs for strings).
        unique_labels = set(df['label'])
        try:
            all_label = sorted(unique_labels)
        except TypeError:
            # Labels of mixed, mutually unorderable types.
            all_label = sorted(unique_labels, key=str)
        self.label_mapping = {label: code for code, label in enumerate(all_label)}
        df['label'] = df['label'].map(self.label_mapping)
        print('有用的数据共%d条' % len(df))
        logger.info('处理后的数据量为 %d 条' % len(df))
        train_set, test_set = self.split_dataset(df, use_dev=self.process_config['use_dev'])
        logger.info('训练集的数据量为 %d 条' % len(train_set))
        logger.info('测试集的数据量为 %d 条' % len(test_set))
        train_set = train_set.reset_index(drop=True)
        self.labels = train_set['label']
        train_set['content'] = [self.document2sentences(content, key_words) for content in train_set['content']]
        # NOTE(review): 'title' is not covered by the dropna above -- confirm
        # that missing titles cannot reach jieba.lcut.
        titles_tokenized = [jieba.lcut(sentences) for sentences in train_set['title']]
        contents_tokenized = [jieba.lcut(sentences) for sentences in train_set['content']]
        titles_tokenized_filtered = [self.filtrate_words(words) for words in titles_tokenized]
        print(' | Train | Content |  Filtered ......')
        self.contents_tokenized_filtered = [self.filtrate_words(words) for words in contents_tokenized]
        vocabulary_title = self.get_vocabulary_title(titles_tokenized_filtered,
                                                     self.contents_tokenized_filtered,
                                                     self.labels)
        tfidf_title, idf_title = self.get_tfidf_title(titles_tokenized_filtered,
                                                      self.contents_tokenized_filtered,
                                                      vocabulary_title)
        labels = self.labels.tolist()
        embedding_dir = self.embedding_config['embedding_path']
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(embedding_dir, exist_ok=True)
        joblib.dump(vocabulary_title, filename=os.path.join(
            embedding_dir, self.embedding_config['name'] + '_vocabulary_title.pkl'))
        joblib.dump(idf_title, filename=os.path.join(
            embedding_dir, self.embedding_config['name'] + '_idf_title.pkl'))
        return tfidf_title, idf_title, labels

    def content_process(self, Index_Retain_Predict_Title):
        """Run the content-stage preprocessing and persist its artefacts.

        Uses ``self.contents_tokenized_filtered`` and ``self.labels``, both of
        which are set by :meth:`title_process`, so that method has to run
        first. The selected vocabulary and the idf vector are dumped under
        ``embedding_config['embedding_path']``.

        :param Index_Retain_Predict_Title: iterable of training-sample
            positions to include in the content stage.
        :return: ``(tfidf_content, idf_content)`` for the selected samples.
        """
        # content_feature_ratio (read inside get_vocabulary_content) is
        # adjustable: it controls the vocabulary length so that an overly long
        # vocabulary does not cause excessive run time or memory exhaustion.
        vocabulary_content = self.get_vocabulary_content(self.contents_tokenized_filtered,
                                                         self.labels,
                                                         Index_Retain_Predict_Title)
        tfidf_content, idf_content = self.get_tfidf_content(self.contents_tokenized_filtered,
                                                            vocabulary_content,
                                                            Index_Retain_Predict_Title)
        embedding_dir = self.embedding_config['embedding_path']
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(embedding_dir, exist_ok=True)
        joblib.dump(vocabulary_content, os.path.join(
            embedding_dir, self.embedding_config['name'] + '_vocabulary_content.pkl'))
        joblib.dump(idf_content, os.path.join(
            embedding_dir, self.embedding_config['name'] + '_idf_content.pkl'))
        return tfidf_content, idf_content