#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : search_method.py
# @Time    : 2022/12/15 15:44
# @Author  : bruxelles_li
# @Software: PyCharm


"""
    pip install bert-serving-server && pip install bert-serving-client
"""
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from bert_serving.client import BertClient
from tqdm import tqdm
from numpy import *
import numpy as np
import logging
from pytime import pytime
from datetime import datetime

from 缓存处理 import memory_cache
# Tip: rows can be selected by a column value with
#   df.loc[df['columnName'] == 'the value']
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %(message)s',
)
logger = logging.getLogger(__name__)

# Maps the content-type flag passed by callers to the content-type name
# stored in the paragraph / sentence libraries.
type2namedict = {
    "speech_by_leaders": "领导讲话",
    "policy_document": "政策文件",
    "expert_opinion": "专家观点",
    "enterprise_case": "企业案例",
    "other": "其他",
}

# Client for the BERT encoding service (created at import time).
bc = BertClient("114.116.54.108", check_length=False)

# Minimum cosine similarity for a paragraph / sentence to count as a hit.
para_prob = 0.85
sent_prob = 0.85

# Paths of the saved sentence and paragraph matrices.
sent_file_path = "database/sent_database/句子库.npy"
para_file_path = "database/para_database/段落库.npy"
# Paragraph library: every cell is read as a string (empty cells stay "").
# The lookup tables below are keyed by the paragraph's unique 'id' column.
# They are built column-wise with dict(zip(...)) instead of one
# DataFrame.iterrows() pass per table; the resulting mappings are the same
# (for duplicate ids the last row still wins).
para_df = pd.read_excel('素材库/段落库/入库_para.xlsx', keep_default_na=False).astype(str)
_id2paracont = dict(zip(para_df['id'], para_df['content']))
_id2para_articleid = dict(zip(para_df['id'], para_df['article_id']))
_id2paragraphid = dict(zip(para_df['id'], para_df['paragraph_id']))
_id2paraindex = dict(zip(para_df['id'], para_df['para_article_index']))
_id2para_topic_type = dict(zip(para_df['id'], para_df['topic_type']))
_id2para_content_type_name = dict(zip(para_df['id'], para_df['content_type_name']))

# Sentence library (GBK-encoded CSV), keyed by the sentence's unique 'id'.
sent_df = pd.read_csv('素材库/句子库/入库_sent.csv', keep_default_na=False, encoding="gbk").astype(str)
_id2sentcont = dict(zip(sent_df['id'], sent_df['content']))
_id2sent_articleid = dict(zip(sent_df['id'], sent_df['article_id']))
_id2sent_paraid = dict(zip(sent_df['id'], sent_df['paragraph_id']))
_id2sent_paraindex = dict(zip(sent_df['id'], sent_df['sent_para_index']))
_id2sent_articleindex = dict(zip(sent_df['id'], sent_df['sent_article_index']))
_id2sent_topic_type = dict(zip(sent_df['id'], sent_df['topic_type']))
_id2sent_content_type_name = dict(zip(sent_df['id'], sent_df['content_type_name']))
_id2sent_sentid = dict(zip(sent_df['id'], sent_df['sentence_id']))

# Article library, keyed by 'article_id'. Used to resolve the basic article
# information (title, origin, publish time, content) of the article that a
# paragraph or sentence belongs to.
article_df = pd.read_excel("素材库/文章库/入库_article.xlsx", keep_default_na=False).astype(str)
artcile_id2title = dict(zip(article_df['article_id'], article_df['article_title']))
artcile_id2origin = dict(zip(article_df['article_id'], article_df['origin']))
artcile_id2time = dict(zip(article_df['article_id'], article_df['article_time']))
# The author mapping is currently disabled:
# artcile_id2author = dict(zip(article_df['article_id'], article_df['author']))
article_id2content = dict(zip(article_df['article_id'], article_df['content']))


def get_para_result(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime.date, pEndTime: datetime.date, pageSize: int, pageNo: int, returenType: str):
    """Return paragraphs from the paragraph library that are similar to ``text``.

    ``text`` is encoded with the BERT client ``bc`` and compared by cosine
    similarity against the vectors stored in ``para_file_path``. Paragraphs
    scoring at least ``para_prob`` are ranked by descending similarity, the
    top candidates are filtered, and the survivors are returned.

    Args:
        text: Query text.
        contentTypeFlags: Keys of ``type2namedict``; when non-empty only
            paragraphs whose content-type name matches one of them are kept.
            An unknown flag raises ``KeyError``.
        topicTypeNames: When non-empty only paragraphs whose topic type is in
            this list are kept. ``None`` is treated as "no filter".
        pStartTime: Inclusive lower bound for the article publish date.
        pEndTime: Inclusive upper bound for the article publish date.
        pageSize: Page size; together with ``pageNo`` it bounds how many
            ranked candidates are examined (``pageNo * pageSize``, or four
            times that when a type/topic filter is active).
        pageNo: Page number (see ``pageSize``).
        returenType: Not used by this function.

    Returns:
        A list of at most 100 dicts (keys: content, similarity, id,
        article_id, paragraphid, paraindex, para_topic_type,
        para_content_type_name, article_content, publishDate, author, origin,
        title), de-duplicated by ``content`` and ordered by descending
        similarity. The hit is wrapped in a red ``<font>`` tag.

    Note:
        Filters are applied after the candidate list has been truncated, so
        fewer than ``pageNo * pageSize`` results may be returned.
    """

    def _as_date(value):
        # ``datetime`` is the class imported from the ``datetime`` module.
        # The parser may hand back either a ``datetime`` or a plain ``date``;
        # the two cannot be compared with each other, so reduce to ``date``.
        return value.date() if isinstance(value, datetime) else value

    # Translate the content-type flags into the names stored in the library.
    content_type_names = [type2namedict[flag] for flag in (contentTypeFlags or [])]
    topic_names = list(topicTypeNames or [])
    start_date = _as_date(pStartTime)
    end_date = _as_date(pEndTime)

    # Column 0 of the saved matrix holds the row id, the remaining columns
    # hold the embedding vector.
    matrix = np.load(para_file_path)
    id_list = matrix[:, 0].tolist()
    vectors = matrix[:, 1:]
    similarities = cosine_similarity(bc.encode([text]), vectors)[0]

    # Keep the rows above the threshold, best match first (stable for ties).
    hits = np.flatnonzero(similarities >= para_prob)
    ranked = hits[np.argsort(-similarities[hits], kind="stable")]

    # Examine a wider candidate window when a type/topic filter is active,
    # because the filter will discard part of it.
    limit = pageNo * pageSize
    if content_type_names or topic_names:
        limit *= 4

    results = []
    seen_contents = set()
    for row_index in tqdm(ranked[:limit].tolist()):
        # Ids are stored as numbers; "12.0" -> "12".
        # NOTE(review): very large ids would print in scientific notation and
        # be truncated here - confirm the id range of the saved matrix.
        _id = str(id_list[row_index]).split(".")[0]

        para_content = _id2paracont.get(_id)
        if para_content is None:
            logger.warning("paragraph id %s not found in paragraph library, skipped", _id)
            continue

        para_topic_type = _id2para_topic_type.get(_id)
        para_content_type_name = _id2para_content_type_name.get(_id)
        if content_type_names and para_content_type_name not in content_type_names:
            continue
        if topic_names and para_topic_type not in topic_names:
            continue

        # Article metadata is resolved through the paragraph's article id.
        para_article_id = _id2para_articleid.get(_id)
        publishDate = artcile_id2time.get(para_article_id)
        if not publishDate:
            logger.warning("no publish time for article %s, paragraph %s skipped", para_article_id, _id)
            continue
        published = _as_date(pytime.parse(publishDate))
        if not (start_date <= published <= end_date):
            continue

        content = "<font style='color:red;'>" + para_content + "</font>"
        if content in seen_contents:
            # Keep only the best-ranked entry for identical content.
            continue
        seen_contents.add(content)

        results.append({
            "content": content,
            "similarity": round(float(similarities[row_index]), 4),
            "id": _id,
            "article_id": para_article_id,
            "paragraphid": _id2paragraphid.get(_id),
            "paraindex": _id2paraindex.get(_id),
            "para_topic_type": para_topic_type,
            "para_content_type_name": para_content_type_name,
            "article_content": article_id2content.get(para_article_id),
            "publishDate": publishDate,
            "author": "",
            "origin": artcile_id2origin.get(para_article_id),
            "title": artcile_id2title.get(para_article_id),
        })
        if len(results) == 100:
            # The result list is capped at 100 entries.
            break
    return results


def get_sent_result(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime.date, pEndTime: datetime.date, pageSize: int, pageNo: int):
    """Return sentences from the sentence library that are similar to ``text``.

    ``text`` is encoded with the BERT client ``bc`` and compared by cosine
    similarity against the vectors stored in ``sent_file_path``. Sentences
    scoring at least ``sent_prob`` are ranked by descending similarity, the
    top candidates are filtered, and each survivor is returned together with
    the previous and next sentence of the same article.

    Args:
        text: Query text.
        contentTypeFlags: Keys of ``type2namedict``; when non-empty only
            sentences whose content-type name matches one of them are kept.
            An unknown flag raises ``KeyError``.
        topicTypeNames: When non-empty only sentences whose topic type is in
            this list are kept. ``None`` is treated as "no filter".
        pStartTime: Inclusive lower bound for the article publish date.
        pEndTime: Inclusive upper bound for the article publish date.
        pageSize: Page size; together with ``pageNo`` it bounds how many
            ranked candidates are examined (``pageNo * pageSize``, or four
            times that when a type/topic filter is active).
        pageNo: Page number (see ``pageSize``).

    Returns:
        A list of at most 100 dicts (keys: content, similarity, id,
        article_id, paragraphid, sent_para_index, sent_article_index,
        sent_topic_type, sent_content_type_name, article_content,
        publishDate, author, origin, title), de-duplicated by ``content`` and
        ordered by descending similarity. ``content`` is
        ``previous sentence + highlighted hit + next sentence``.

    Note:
        Filters are applied after the candidate list has been truncated, so
        fewer than ``pageNo * pageSize`` results may be returned.
    """

    def _as_date(value):
        # ``datetime`` is the class imported from the ``datetime`` module.
        # The parser may hand back either a ``datetime`` or a plain ``date``;
        # the two cannot be compared with each other, so reduce to ``date``.
        return value.date() if isinstance(value, datetime) else value

    def _neighbour_content(article_id, sentence_no):
        # Content of the sentence numbered ``sentence_no`` in the given
        # article, or "" when the library has no such sentence.
        match = sent_df.loc[
            (sent_df['article_id'] == article_id) & (sent_df['sentence_id'] == str(sentence_no)),
            'content',
        ]
        return "" if match.empty else match.iloc[0]

    # Translate the content-type flags into the names stored in the library.
    content_type_names = [type2namedict[flag] for flag in (contentTypeFlags or [])]
    topic_names = list(topicTypeNames or [])
    start_date = _as_date(pStartTime)
    end_date = _as_date(pEndTime)

    # Column 0 of the saved matrix holds the row id, the remaining columns
    # hold the embedding vector.
    matrix = np.load(sent_file_path)
    id_list = matrix[:, 0].tolist()
    vectors = matrix[:, 1:]
    similarities = cosine_similarity(bc.encode([text]), vectors)[0]

    # Keep the rows above the threshold, best match first (stable for ties).
    hits = np.flatnonzero(similarities >= sent_prob)
    ranked = hits[np.argsort(-similarities[hits], kind="stable")]

    # Examine a wider candidate window when a type/topic filter is active,
    # because the filter will discard part of it.
    limit = pageNo * pageSize
    if content_type_names or topic_names:
        limit *= 4

    results = []
    seen_contents = set()
    for row_index in tqdm(ranked[:limit].tolist()):
        # Ids are stored as numbers; "12.0" -> "12".
        # NOTE(review): very large ids would print in scientific notation and
        # be truncated here - confirm the id range of the saved matrix.
        _id = str(id_list[row_index]).split(".")[0]

        sent_content = _id2sentcont.get(_id)
        if sent_content is None:
            logger.warning("sentence id %s not found in sentence library, skipped", _id)
            continue

        sent_topic_type = _id2sent_topic_type.get(_id)
        sent_content_type_name = _id2sent_content_type_name.get(_id)
        if content_type_names and sent_content_type_name not in content_type_names:
            continue
        if topic_names and sent_topic_type not in topic_names:
            continue

        # Article metadata is resolved through the sentence's article id.
        sent_article_id = _id2sent_articleid.get(_id)
        publishDate = artcile_id2time.get(sent_article_id)
        if not publishDate:
            logger.warning("no publish time for article %s, sentence %s skipped", sent_article_id, _id)
            continue
        published = _as_date(pytime.parse(publishDate))
        if not (start_date <= published <= end_date):
            continue

        # Only now, for rows that passed every filter, scan the sentence
        # library for the neighbouring sentences of the same article.
        try:
            sentence_no = int(_id2sent_sentid.get(_id))
        except (TypeError, ValueError):
            # Without a numeric sentence number no neighbours can be found.
            pre_sent = suf_sent = ""
        else:
            pre_sent = _neighbour_content(sent_article_id, sentence_no - 1)
            suf_sent = _neighbour_content(sent_article_id, sentence_no + 1)

        content = pre_sent + "<font style='color:red;'>" + sent_content + "</font>" + suf_sent
        if content in seen_contents:
            # Keep only the best-ranked entry for identical content.
            continue
        seen_contents.add(content)

        results.append({
            "content": content,
            "similarity": round(float(similarities[row_index]), 4),
            "id": _id,
            "article_id": sent_article_id,
            "paragraphid": _id2sent_paraid.get(_id),
            "sent_para_index": _id2sent_paraindex.get(_id),
            "sent_article_index": _id2sent_articleindex.get(_id),
            "sent_topic_type": sent_topic_type,
            "sent_content_type_name": sent_content_type_name,
            "article_content": article_id2content.get(sent_article_id),
            "publishDate": publishDate,
            "author": "",
            "origin": artcile_id2origin.get(sent_article_id),
            "title": artcile_id2title.get(sent_article_id),
        })
        if len(results) == 100:
            # The result list is capped at 100 entries.
            break
    return results


if __name__ == "__main__":
    # Manual smoke test: run one sentence search, then print the second and
    # third pages of whatever the memory cache holds for the same query.
    text = "新时代共同富裕的宗旨有"
    contentTypeFlags = ["speech_by_leaders"]
    topicTypeNames = ["共同富裕"]
    pStartTime = pytime.parse("2021-06-08")
    pEndTime = pytime.parse("2022-12-13")
    pageNo, pageSize = 1, 10

    result_list = get_sent_result(
        text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo
    )
    print(result_list)

    cache_list = memory_cache.get_value(text)
    if not cache_list:
        # Nothing cached for this query text.
        print("查询已失效，请重新查询！")
    else:
        result_two = cache_list[10:20]
        print(result_two)
        result_three = cache_list[20:30]
        print(result_three)








