#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2021/5/11 20:14
# @Author  : 程婷婷
# @FileName: XgboostClassifyProcess.py
# @Software: PyCharm
import numpy as np
import re
import time
from model.base.views.data.BaseDataProcess import BaseDataProcess

class KmeansProcess(BaseDataProcess):
    """Turn raw text records into a numeric feature matrix.

    The pipeline is: character filtering -> tokenization -> bag-of-words
    features -> word2vec vectors -> matrix product of the two.  The
    tokenizers, ``bag_of_words``, ``word2vec`` and ``process_config`` are
    not defined here; they are expected to come from ``BaseDataProcess``.
    """

    # Matches every character that is NOT a CJK ideograph in U+4E00-U+9FA5
    # or one of the listed Chinese/ASCII punctuation marks.  Compiled once
    # at class creation instead of on every remove_char() call.
    _DISALLOWED_CHARS = re.compile(r'[^\u4e00-\u9fa5，。\.,？\?!！；;]')

    def __init__(self, config_path):
        """Initialise the base processor with the given configuration path."""
        super().__init__(config_path)

    def remove_char(self, content):
        """Return *content* with all disallowed characters removed.

        Only Chinese characters (U+4E00-U+9FA5) and the punctuation marks
        ``，。.,？?!！；;`` survive.

        NOTE(review): the original comment said English letters and digits
        are kept as well, but the pattern removes them.  The pattern is left
        unchanged here to preserve existing behaviour -- confirm which of the
        two is intended.
        """
        return self._DISALLOWED_CHARS.sub('', content)

    def process(self, data, min_content):
        """Filter and tokenize every record in *data*.

        Args:
            data: sized iterable of raw text strings.
            min_content: a record is kept only if, after character
                filtering, its length is strictly greater than this value.

        Returns:
            List of tokenized records, in input order, for the records that
            passed the length filter.  Shorter records are dropped, so the
            result may contain fewer items than *data*.
        """
        total = len(data)
        processed_data = []
        # Count from 1 so the progress message reports the number of records
        # actually handled so far (kept and dropped alike), and the final
        # message is emitted exactly when the last record is done.
        for count, record in enumerate(data, start=1):
            record = self.remove_char(record)
            if len(record) > min_content:
                if self.process_config['tokenizer'] == 'PerceptronLexicalAnalyzer':
                    record = self.pla_tokenizer(record)
                else:
                    # Any other configured value falls back to jieba.
                    record = self.jieba_tokenizer(record)
                processed_data.append(record)
            if count % 100 == 0 or count == total:
                print(time.strftime('%Y-%m-%d %H:%M:%S'), '第', count, '条文本分词完毕')
        return processed_data

    def runner_process(self, data, labels):
        """Run the full preprocessing pipeline and return the feature matrix.

        Args:
            data: sized iterable of raw text strings.
            labels: passed through unchanged to ``bag_of_words``.

        Returns:
            ``np.dot`` of the bag-of-words output and the word2vec output.
        """
        # Records with 10 or fewer characters after filtering are discarded.
        # NOTE(review): ``labels`` is not filtered in step with the dropped
        # records -- verify that ``bag_of_words`` tolerates the mismatch.
        processed_data = self.process(data, min_content=10)
        transformed_data1, feature_words = self.bag_of_words(processed_data, labels)
        # Presumably the tokenizers return space-separated token strings;
        # word2vec is fed one token list per record.
        token_lists = [record.split(' ') for record in processed_data]
        transformed_data2 = self.word2vec(token_lists, feature_words=feature_words)
        return np.dot(transformed_data1, transformed_data2)

# import pandas as pd
# df = pd.read_excel(r'E:\working\model_train\KMeans\data\test.xlsx')
# kp = KmeansProcess()
# kp.runner_process()
