#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : main_app.py
# @Time    : 2022/12/14 19:49
# @Author  : bruxelles_li
# @Software: PyCharm

import os, json
import logging
from flask import Flask, request, jsonify
import sys
sys.path.append('../')
from 文章内容检查 import clean_html_tag
from 素材库构建程序 import *
from 文章id生成 import create_title_id
import requests
import queue
import pandas as pd
import time
from smart_extractor import extract_by_url_test
import traceback
from pathlib import Path
from tqdm import tqdm
from search_by_dot_matrix import get_sent_result, get_para_result, get_sen_duplicated, get_para_duplicated, put_para_list, put_sen_list

# Close surplus connections.
# NOTE(review): `keep_alive` is not an attribute requests.Session is known to read, so this
# assignment is probably a no-op; `s` is also not referenced again in this file — confirm intent.
s = requests.session()
s.keep_alive = False
# Order CUDA devices by PCI bus id and expose none of them ('-1').
# Presumably this keeps the service on CPU — TODO confirm.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Root logging config: INFO level; every record carries timestamp, level,
# process name and thread name ahead of the message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
                                               'message)s')
logger = logging.getLogger(__name__)

# Bind address, port and debug flag passed to app.run() when this file is run as a script.
HOST = '0.0.0.0'
PORT = 4002
DEBUG = False
app = Flask(__name__)
# todo: define the cache path
# Directory that holds the search-result cache files (para.json / sent.json);
# it is created at import time if it does not exist yet.
cache_path = "测试文件"
Path(cache_path).mkdir(parents=True, exist_ok=True)
# Basic FIFO (first in, first out) queue.
# maxsize caps the number of queued items; a value <= 0 means no limit. Once the cap is
# reached, producers block until items are consumed.
# NOTE(review): `q` is not referenced by any handler in this file.
q = queue.Queue(maxsize=0)
# todo: define the article contents (currently disabled)
# df0 = pd.read_excel('素材库/文章库/入库_article.xlsx').astype(str)
# art_list = df0["content"].tolist()
# content2_id = {row['content']: row['id'] for idx, row in df0.iterrows()}

# Cross-origin (CORS) support, with credentials allowed.
from flask_cors import CORS

CORS(app, supports_credentials=True)


@app.route("/", methods=["GET"])
def hello_world():
    """Liveness probe: log a greeting and answer with a fixed plain-text body."""
    response_body = "Hello World"
    logger.info('Hello World!')
    return response_body


# Supported content types, one row per type:
#   (request flag, category id, display name)
# Flags that are not enabled yet: think_tanks ("智库"),
# policies_regulations ("政策法规"), enterprise_news ("企业资讯").
_CONTENT_TYPE_ROWS = (
    ("speech_by_leaders", "1602095566267805697", "领导讲话"),
    ("policy_document", "1602095618788880386", "政策文件"),
    ("expert_opinion", "1602095680285765633", "专家观点"),
    ("enterprise_case", "1602095727870144513", "企业案例"),
    ("other", "1602095773126684673", "其他"),
)

# Request flag -> category id (ids are kept as strings).
type2iddict = {flag: type_id for flag, type_id, _name in _CONTENT_TYPE_ROWS}

# Request flag -> display name.
type2namedict = {flag: name for flag, _type_id, name in _CONTENT_TYPE_ROWS}


# Minimum length (in characters) the cleaned article text must reach to be processed.
_MIN_CONTENT_LENGTH = 50


def _dedupe_records(records, key):
    """Return ``records`` as a list of dicts without repeated ``key`` values.

    The first record carrying each value is kept and the input order is
    preserved. An input that yields an empty frame gives ``[]`` instead of the
    ``KeyError`` that ``drop_duplicates`` raises when the column is missing.
    """
    frame = pd.DataFrame(records)
    if frame.empty:
        return []
    unique_rows = frame.drop_duplicates(subset=[key], keep="first")
    return unique_rows.to_dict(orient="records")


# Endpoint that builds material-library records from one article.
@app.route('/build_pro', methods=["POST"])
def get_result():
    """Split one article into de-duplicated paragraph and sentence records.

    Request JSON fields:
        url (optional): when present, ``title``, ``content`` and
            ``publishDate`` are taken from ``extract_by_url_test(url, lang_code)``;
            otherwise those three fields must be supplied in the request.
        lang_code (optional): forwarded to the extractor, defaults to ``"cn"``.
        infoId (optional): article id; when missing or empty a new one is
            generated with ``create_title_id()``.
        contentTypeFlags: one key of ``type2namedict`` / ``type2iddict``
            (``speech_by_leaders``, ``policy_document``, ``expert_opinion``,
            ``enterprise_case`` or ``other``).
        topicNames, origin, author: required; copied into the response
            (``topicNames`` is also forwarded to ``build_pro_new``).

    Returns:
        A JSON string with ``code``, ``message`` and ``resultData``. On success
        ``resultData`` holds ``article_info`` (a one-element list),
        ``para_info`` and ``sent_info``. Any failure, including cleaned content
        shorter than 50 characters, is reported as ``code`` 500 with
        ``resultData`` set to ``None``.
    """
    try:
        data = request.get_json()
        if not isinstance(data, dict):
            raise ValueError("request body must be a JSON object")

        # Prefer extracting title / publish date / body from the URL when one is given.
        if "url" in data:
            lang_code = data.get("lang_code", "cn")
            dict_parse = extract_by_url_test(data["url"], lang_code)
            title = dict_parse["title"]
            ori_content = dict_parse["content"]
            publishDate = dict_parse["publishDate"]
        else:
            title = data['title']
            ori_content = data['content']
            publishDate = data['publishDate']
        content = clean_html_tag(ori_content)

        # A missing infoId is treated like an empty one: generate a fresh id.
        raw_info_id = data.get('infoId')
        infoId = str(raw_info_id) if raw_info_id else str(create_title_id())
        contentTypeFlags = data['contentTypeFlags']
        topicNames = data['topicNames']
        origin = data['origin']
        author = data['author']

        # Resolve the content-type flag to its display name and category id.
        if contentTypeFlags not in type2namedict:
            raise ValueError("unknown contentTypeFlags: {!r}".format(contentTypeFlags))
        contentNames = type2namedict[contentTypeFlags]
        contentTypeIds = str(type2iddict[contentTypeFlags])

        if len(content) >= _MIN_CONTENT_LENGTH:
            list_para, list_sent = build_pro_new(infoId, content, contentNames, contentTypeIds, topicNames)
            # Remove paragraphs / sentences whose text repeats within this article.
            new_list_para = _dedupe_records(list_para, "para_content")
            new_list_sent = _dedupe_records(list_sent, "sent_content")
            # todo: de-duplicate against the stored library and fill in
            #       repeatedId / is_main (left empty for now).
            dict_result = {
                "code": 200,
                "message": "success",
                "resultData": {
                    "article_info": [
                        {
                            "repeatedId": "",
                            "is_main": "",
                            "create_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                            "infoId": infoId,
                            "content": content,
                            "title": title,
                            "contentNames": contentNames,
                            "contentTypeIds": contentTypeIds,
                            "topicNames": topicNames,
                            "origin": origin,
                            "publishDate": publishDate,
                            "author": author,
                            "type": "art"
                        }
                    ],
                    "para_info": new_list_para,
                    "sent_info": new_list_sent
                }
            }
        else:
            # Cleaned text is too short to be a usable article.
            dict_result = {
                "code": 500,
                "message": "failure" + "文章内容杂乱，请检查并清除杂乱格式再进行操作！",
                "resultData": None
            }

    except Exception as e:
        # Boundary handler: keep the JSON failure envelope, but record the traceback.
        logger.exception("build_pro request failed")
        dict_result = {
            'code': 500,
            'message': "failure" + str(e),
            'resultData': None
        }
    logger.info(dict_result)
    return json.dumps(dict_result, ensure_ascii=False)


@app.route('/put_database', methods=["POST"])
def put_result():
    """Store the paragraph and sentence records produced by ``/build_pro``.

    Request JSON: an object whose ``resultData`` contains ``para_info`` and
    ``sent_info``. Sentences are handed to ``put_sen_list`` first, then
    paragraphs to ``put_para_list``.

    Returns:
        A JSON string with ``code``, ``message`` and ``resultData``. ``code``
        is 200 only when both storage calls return ``"1"``; any other outcome,
        including an exception, is reported as ``code`` 500 with
        ``resultData`` set to ``None``.
    """
    try:
        data = request.get_json()
        para_list = data["resultData"]["para_info"]
        sen_list = data["resultData"]["sent_info"]
        sen_flag = put_sen_list(sen_list)
        para_flag = put_para_list(para_list)
        if sen_flag == "1" and para_flag == "1":
            dict_result = {
                "code": 200,
                "message": "success",
                "resultData": "已成功处理"
            }
        else:
            # Without this branch the view would return None, which Flask
            # rejects as an invalid response instead of sending the envelope.
            logger.error("put_database not confirmed: sen_flag=%r, para_flag=%r", sen_flag, para_flag)
            dict_result = {
                "code": 500,
                "message": "failure" + "storage did not confirm success for all records",
                "resultData": None
            }

    except Exception as e:
        # Boundary handler: keep the JSON failure envelope, but record the traceback.
        logger.exception("put_database request failed")
        dict_result = {
            'code': 500,
            'message': "failure" + str(e),
            'resultData': None
        }
    logger.info(dict_result)
    return json.dumps(dict_result, ensure_ascii=False)


def _read_search_cache(cache_file):
    """Load a cached search payload; return ``None`` when absent or unusable.

    A file that cannot be read or parsed is treated as a cache miss, so a
    truncated or hand-edited cache file cannot make every later request fail.
    """
    if not os.path.isfile(cache_file):
        return None
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
    except (OSError, ValueError):
        logger.warning("Ignoring unreadable search cache %s", cache_file, exc_info=True)
        return None
    return payload if isinstance(payload, dict) else None


@app.route('/search_content', methods=["POST"])
def get_top_content():
    """Return one page of paragraphs or sentences matching a query text.

    Request JSON fields:
        queryText: text to search for (required).
        contentTypeFlags (optional): list of ``type2namedict`` keys.
        topicNames (optional): forwarded to the search function.
        returenType (optional; the spelling is the wire name): ``"par"``
            searches paragraphs, anything else searches sentences.
        pageNo, pageSize: required, converted with ``int``.
        pStartTime / pEndTime (optional): default to ``"2021-00-00"`` and
            ``"2023-00-00"``.

    The most recent search per kind is cached in ``para.json`` / ``sent.json``
    under ``cache_path``. The cache is reused only when the query text, content
    type flags, topic names and date range all equal the cached ones, so
    changing any filter triggers a fresh search. Rows served from the cache are
    filtered by ``content_type_name`` and ``total`` is the size of that
    filtered list; on a fresh search ``total`` is the count returned by the
    search function.

    Returns:
        A JSON string. Success: ``code`` 200 with ``result_data`` holding
        ``match_info``, ``pageNo``, ``pageSize`` and ``total``. Failure:
        ``code`` 500 with ``resultData`` set to ``None`` (the differing key
        spelling is kept for existing clients).
    """
    try:
        data = request.get_json()
        text = data['queryText']
        contentTypeFlags = data.get('contentTypeFlags', [])
        topicTypeNames = data.get('topicNames', [])
        # Optional fields fall back to their defaults when missing as well as when empty.
        returenType = data.get('returenType') or "sen"
        pageNo = int(data['pageNo'])
        pageSize = int(data['pageSize'])
        pStartTime = data.get('pStartTime') or "2021-00-00"
        pEndTime = data.get('pEndTime') or "2023-00-00"

        # Display names used to filter cached rows.
        if contentTypeFlags:
            contentTypeName_list = [type2namedict[flag] for flag in contentTypeFlags]
        else:
            contentTypeName_list = ["领导讲话", "专家观点", "政策文件", "企业案例", "其他"]

        # Paragraph and sentence searches differ only in cache file, payload key and search function.
        if returenType == "par":
            cache_name, list_key, search = "para.json", "para_list", get_para_result
        else:
            cache_name, list_key, search = "sent.json", "sent_list", get_sent_result
        cache_file = os.path.join(cache_path, cache_name)

        # Everything that shapes the search result is part of the cache identity.
        query = {
            "text": text,
            "contentTypeFlags": contentTypeFlags,
            "topicNames": topicTypeNames,
            "pStartTime": pStartTime,
            "pEndTime": pEndTime,
        }
        pre_index = pageNo * pageSize - pageSize
        suf_index = pageNo * pageSize

        cached = _read_search_cache(cache_file)
        cache_hit = (
            cached is not None
            and isinstance(cached.get(list_key), list)
            and all(name in cached and cached[name] == value for name, value in query.items())
        )

        if cache_hit:
            matches = [row for row in cached[list_key] if row["content_type_name"] in contentTypeName_list]
            page = matches[pre_index:suf_index]
            # Total over all pages, not just the size of the current page.
            total = len(matches)
        else:
            # NOTE(review): the search function receives pageSize/pageNo, yet its result is
            # sliced again below, so it presumably returns the full result list — confirm.
            result_list, total = search(text, contentTypeFlags, topicTypeNames, pStartTime,
                                        pEndTime, pageSize, pageNo, returenType)
            logger.info("search_content: fresh %s search reported %s results", returenType, total)
            # Serialise before opening the file so a serialisation error cannot leave a partial cache.
            payload = dict(query)
            payload[list_key] = result_list
            serialized = json.dumps(payload)
            with open(cache_file, 'w', encoding='utf-8') as file:
                file.write(serialized)
            page = result_list[pre_index:suf_index]

        dict_result = {
            'code': 200,
            'message': 'success',
            'result_data': {
                "match_info": page,
                "pageNo": pageNo,
                "pageSize": pageSize,
                "total": total
            }
        }
        return json.dumps(dict_result, ensure_ascii=False)

    except Exception as e:
        # Boundary handler: keep the JSON failure envelope, but record the traceback.
        logger.exception("search_content request failed")
        dic_result = {
            'code': 500,
            'message': "failure" + str(e),
            'resultData': None
        }
        logger.info(dic_result)
        return json.dumps(dic_result, ensure_ascii=False)


if __name__ == '__main__':
    # Executed as a script: start Flask's own server on the configured address.
    app.run(host=HOST, port=PORT, debug=DEBUG)
else:
    # Imported as a module: attach the handlers and level of the
    # 'gunicorn.error' logger to the Flask app logger.
    gunicorn_logger = logging.getLogger('gunicorn.error')
    app.logger.handlers = gunicorn_logger.handlers
    app.logger.setLevel(gunicorn_logger.level)
