#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : gj_app.py
# @Time    : 2022/12/7 19:49
# @Author  : bruxelles_li
# @Software: PyCharm

import os, json
import logging
from flask import Flask, request, jsonify
import sys
sys.path.append('../')
from 文章内容检查 import clean_html_tag
from 素材库构建程序 import *
from 文章id生成 import create_title_id
import requests, time
import queue
import pandas as pd
import traceback
from search_by_dot_matrix import get_sent_result, get_para_result
from pytime import pytime
from datetime import datetime
from pathlib import Path
from smart_extractor import extract_by_url_test
# Directory (relative to the current working directory) that holds the
# search-result cache files used by the /search_content endpoint.
cache_path = "测试文件"
# Create the directory at import time, including any missing parents;
# an already-existing directory is not an error.
os.makedirs(cache_path, exist_ok=True)

# Intended to avoid keeping surplus HTTP connections open.
# NOTE(review): `s` is not referenced anywhere else in the visible part of this
# file, and `keep_alive` is assigned here as a plain attribute on the session —
# confirm that requests actually honours it before relying on this.
s = requests.session()
s.keep_alive = False
# CUDA device ordering / visibility for this process. A visible-devices value of
# '-1' presumably hides every GPU so that any model code runs on CPU — confirm.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# Root logging configuration: INFO level; every record carries the timestamp,
# level name, process name and thread name ahead of the message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] <%(processName)s> (%(threadName)s) %('
                                               'message)s')
# Module-level logger used by the request handlers below.
logger = logging.getLogger(__name__)

# Bind address, port and debug flag passed to app.run() when this file is
# executed directly (see the __main__ guard at the bottom of the file).
HOST = '0.0.0.0'
PORT = 4000
DEBUG = False
# The Flask application object that all route decorators below register on.
app = Flask(__name__)

# Basic FIFO queue (first in, first out).
# maxsize caps the number of queued items; a value <= 0 means no limit. Once the
# cap is reached, put() blocks until items have been consumed.
q = queue.Queue(maxsize=0)

# Cross-origin (CORS) support
from flask_cors import CORS

# Enable CORS on every route of the app, with credentialed requests allowed.
CORS(app, supports_credentials=True)


# # todo: 定义段落处理
# def para_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime, pEndTime: datetime, pageSize: int):
#     pageNo = 10
#     para_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
#     dict_para = {
#         "text": text,
#         "para_list": para_list
#     }
#     para_result = json.dumps(dict_para)
#
#     with open(os.path.join(cache_path, "para.json"), 'w', encoding='utf-8') as file:
#         file.write(para_result)
#     time.sleep(60)                 # 设置一个 60 秒过期的缓存文件清除时间
#     os.remove(os.path.join(cache_path, "para.json"))
#     return None
#
#
# # todo: 定义句子处理
# def sent_process(text: str, contentTypeFlags: list, topicTypeNames: list, pStartTime: datetime, pEndTime: datetime, pageSize: int):
#     pageNo = 10
#     sent_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
#     dict_sent = {
#         "text": text,
#         "sent_list": sent_list
#     }
#     # todo: 将内容转换为JSON字符串用来存储
#     sent_result = json.dumps(dict_sent)
#
#     with open(os.path.join(cache_path, "sent.json"), 'w', encoding='utf-8') as file:
#         file.write(sent_result)
#     time.sleep(60)
#     os.remove(os.path.join(cache_path, "sent.json"))
#     return None


@app.route("/", methods=["GET"])
def hello_world():
    """Log a greeting through the Flask app logger and return a fixed string.

    Returns:
        The literal string ``"Hello World"``.
    """
    greeting = "Hello World"
    app.logger.info('Hello World!')
    return greeting


@app.route('/subject_consumer', methods=['GET', 'POST'])
def subject_consumer():
    """Pop the oldest item from the module queue ``q`` and return it as JSON.

    Returns:
        A JSON response. When an item was available it carries ``message``,
        ``queue_left_number`` (the remaining queue size, as a string) and
        ``data`` (the dequeued item). When the queue is empty it carries
        ``message`` and ``queue_left_number`` set to the integer ``0``.
    """
    try:
        # Non-blocking get: checking empty() and then calling the blocking
        # get() is racy — if another consumer drains the queue in between,
        # this request would hang forever.
        config_info = q.get_nowait()
    except queue.Empty:
        return jsonify(message='队列为空！', queue_left_number=0)

    # Read the size once so the message and the field always agree.
    remaining = str(q.qsize())
    return jsonify(message='当前队列数量：' + remaining,
                   queue_left_number=remaining,
                   data=config_info)


@app.route('/queue_size', methods=['GET', 'POST'])
def queue_size():
    """Report how many items are currently held in the module queue ``q``.

    Returns:
        A JSON response with a single field, ``queue_left_number``.
    """
    remaining = q.qsize()
    return jsonify(queue_left_number=remaining)


# Maps each content-type flag accepted by /build_pro to its content-type ID.
# IDs are kept as strings and are returned to the caller as ``contentTypeIds``.
type2iddict = {
    "speech_by_leaders": "1602095566267805697",
    "policy_document": "1602095618788880386",
    "expert_opinion": "1602095680285765633",
    "enterprise_case": "1602095727870144513",
    # Digits only: a stray trailing apostrophe inside this string literal
    # would leak into every response for the "other" content type.
    "other": "1602095773126684673",
}

# Maps each content-type flag accepted by /build_pro to the display name that
# is returned to the caller as ``contentNames``.
type2namedict = {
    "speech_by_leaders": "领导讲话",
    "policy_document": "政策文件",
    "expert_opinion": "专家观点",
    "enterprise_case": "企业案例",
    "other": "其他",
}


def _dedupe_records(records, key):
    """Drop rows whose ``key`` value repeats that of an earlier row.

    Args:
        records: Sequence of dicts (one per paragraph or sentence).
        key: Name of the field whose value identifies a duplicate.

    Returns:
        A list with one dict per surviving row, keyed by column name; the
        first occurrence of each ``key`` value is kept. Empty input yields
        an empty list.

    Raises:
        KeyError: If ``records`` is non-empty but has no ``key`` column.
    """
    if len(records) == 0:
        # A DataFrame built from nothing has no columns, so
        # drop_duplicates(subset=[key]) would raise KeyError; there is
        # nothing to de-duplicate anyway.
        return []
    frame = pd.DataFrame(records).drop_duplicates(subset=[key], keep="first")
    return frame.to_dict(orient="records")


# Endpoint that runs the material-building pipeline on one article.
@app.route('/build_pro', methods=["GET", "POST"])
def get_result():
    """Split one article into de-duplicated paragraph and sentence records.

    The JSON request body must contain ``contentTypeFlags``, ``topicNames``,
    ``origin`` and ``author``, plus either

    * ``url`` (optionally with ``lang_code``, default ``"cn"``), in which case
      title, content and publish date are taken from the dict returned by
      ``extract_by_url_test``; or
    * ``title``, ``content`` and ``publishDate``, in which case ``content`` is
      passed through ``clean_html_tag``.

    ``infoId`` is optional; when absent one is produced by
    ``create_title_id()``.

    ``contentTypeFlags`` must be a key of ``type2namedict`` / ``type2iddict``:

        speech_by_leaders  (leaders' speeches)   1602095566267805697
        policy_document    (policy documents)    1602095618788880386
        expert_opinion     (expert opinions)     1602095680285765633
        enterprise_case    (enterprise cases)    1602095727870144513
        other              (other)               1602095773126684673

    Returns:
        A JSON string (non-ASCII characters are not escaped) with ``code``,
        ``message`` and ``resultData``. On success ``code`` is 200 and
        ``resultData`` holds ``article_info``, ``para_info`` and
        ``sent_info``. When the content is shorter than 50 characters, or any
        exception is raised, ``code`` is 500 and ``resultData`` is ``None``.
    """
    try:
        data = request.get_json()
        if "url" in data:
            # A URL was supplied: title, body and publish date come from the
            # extractor's result rather than from the request.
            lang_code = data.get("lang_code", "cn")
            dict_parse = extract_by_url_test(data["url"], lang_code)
            title = dict_parse["title"]
            content = dict_parse["content"]
            publishDate = dict_parse["publishDate"]
        else:
            title = data['title']
            content = clean_html_tag(data['content'])
            publishDate = data['publishDate']

        infoId = str(data['infoId']) if 'infoId' in data else str(create_title_id())
        contentTypeFlags = data['contentTypeFlags']
        topicNames = data['topicNames']
        origin = data['origin']
        author = data['author']
        # Translate the content-type flag into its display name and ID.
        contentNames = type2namedict[contentTypeFlags]
        contentTypeIds = str(type2iddict[contentTypeFlags])

        # Only articles with at least 50 characters of content are processed.
        if len(content) >= 50:
            list_para, list_sent = build_pro_new(infoId, content, contentNames, contentTypeIds, topicNames)
            dict_result = {
                "code": 200,
                "message": "success",
                "resultData": {
                    "article_info": [
                        {
                            "infoId": infoId,
                            "content": content,
                            "title": title,
                            "contentNames": contentNames,
                            "contentTypeIds": contentTypeIds,
                            "topicNames": topicNames,
                            "origin": origin,
                            "publishDate": publishDate,
                            "author": author,
                        }
                    ],
                    "para_info": _dedupe_records(list_para, "para_content"),
                    "sent_info": _dedupe_records(list_sent, "sent_content"),
                },
            }
        else:
            dict_result = {
                "code": 500,
                "message": "failure" + "文章内容杂乱，请检查并清除杂乱格式再进行操作！",
                "resultData": None,
            }

    except Exception as e:
        # Boundary handler: the client still receives a JSON error body, but
        # the traceback is logged so failures can be diagnosed.
        logger.exception("build_pro request failed")
        dict_result = {
            'code': 500,
            'message': "failure" + str(e),
            'resultData': None,
        }
    logger.info(dict_result)
    return json.dumps(dict_result, ensure_ascii=False)


def _read_search_cache(cache_file, cache_key, list_field):
    """Return the cached hit list if ``cache_file`` was written for ``cache_key``.

    Args:
        cache_file: Path of the JSON cache file.
        cache_key: Dict of query parameters; every entry must be present in
            the cache file with an equal value.
        list_field: Name of the field that holds the cached hits.

    Returns:
        The cached list, or ``None`` when the file is missing, unreadable,
        not a JSON object, lacks ``list_field``, or belongs to another query.
    """
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cached = json.load(f)
    except (OSError, ValueError):
        # Missing file, or content that is not valid JSON (for instance a
        # file caught half-written by a concurrent request): treat as a miss.
        return None
    if not isinstance(cached, dict) or list_field not in cached:
        return None
    for name, expected in cache_key.items():
        if name not in cached or cached[name] != expected:
            return None
    return cached[list_field]


def _write_search_cache(cache_file, cache_key, list_field, hits):
    """Overwrite ``cache_file`` with ``hits`` tagged by the query parameters."""
    entry = dict(cache_key)
    entry[list_field] = hits
    # Serialise before opening so a serialisation error cannot truncate the file.
    serialized = json.dumps(entry)
    with open(cache_file, 'w', encoding='utf-8') as f:
        f.write(serialized)


@app.route('/search_content', methods=["GET", "POST"])
def get_top_content():
    """Return one page of paragraph or sentence search results.

    The JSON request body must contain ``queryText``, ``contentTypeFlags``,
    ``topicNames``, ``pageNo`` and ``pageSize``. Optional fields:
    ``returenType`` (``"par"`` selects paragraph search, anything else
    sentence search; default ``"sen"``), ``pStartTime`` (default
    ``"2021-00-00"``) and ``pEndTime`` (default ``"2023-00-00"``).

    Results of the most recent query of each type are cached in
    ``para.json`` / ``sent.json`` under ``cache_path``. The cache is reused
    only when the query text, both filters and both time bounds match.

    Returns:
        A JSON string (non-ASCII characters are not escaped). On success:
        ``code`` 200, ``message`` and ``result_data`` with ``match_info``,
        ``pageNo``, ``pageSize`` and ``total``. On any exception: ``code``
        500, ``message`` and ``resultData`` set to ``None`` (note the
        different key spelling, kept for existing clients).
    """
    try:
        # Request parameters.
        data = request.get_json()
        text = data['queryText']
        contentTypeFlags = data['contentTypeFlags']
        topicTypeNames = data['topicNames']
        returenType = data.get('returenType') or "sen"
        pageNo = int(data['pageNo'])
        pageSize = int(data['pageSize'])
        # NOTE(review): the fallback bounds are fixed strings rather than real
        # calendar dates — confirm how the search functions interpret them.
        pStartTime = data.get('pStartTime') or "2021-00-00"
        pEndTime = data.get('pEndTime') or "2023-00-00"

        if returenType == "par":
            cache_name, list_field, search = "para.json", "para_list", get_para_result
        else:
            cache_name, list_field, search = "sent.json", "sent_list", get_sent_result
        cache_file = os.path.join(cache_path, cache_name)

        # Everything that influences the result set is part of the cache key;
        # matching on the query text alone would serve stale results when
        # only the filters or the time range change.
        cache_key = {
            "text": text,
            "contentTypeFlags": contentTypeFlags,
            "topicTypeNames": topicTypeNames,
            "pStartTime": pStartTime,
            "pEndTime": pEndTime,
        }

        cached_hits = _read_search_cache(cache_file, cache_key, list_field)
        from_cache = cached_hits is not None
        if from_cache:
            all_hits = cached_hits
            total = len(cached_hits)
        else:
            # NOTE(review): both search functions are called with the same
            # seven positional arguments — confirm that get_para_result does
            # not additionally require returenType as an eighth argument.
            all_hits, total = search(text, contentTypeFlags, topicTypeNames, pStartTime,
                                     pEndTime, pageSize, pageNo)
            _write_search_cache(cache_file, cache_key, list_field, all_hits)

        # 1-based page number -> slice bounds into the full hit list.
        pre_index = (pageNo - 1) * pageSize
        suf_index = pageNo * pageSize
        dict_result = {
            'code': 200,
            'message': 'success',
            'result_data': {
                "match_info": all_hits[pre_index:suf_index],
                "pageNo": pageNo,
                "pageSize": pageSize,
                "total": total
            }
        }
        if from_cache:
            logger.info(dict_result)
        return json.dumps(dict_result, ensure_ascii=False)

    except Exception as e:
        # Boundary handler: log the traceback, answer with a JSON error body.
        logger.exception("search_content request failed")
        dict_result = {
            'code': 500,
            'message': "failure" + str(e),
            'resultData': None
        }
        logger.info(dict_result)
        return json.dumps(dict_result, ensure_ascii=False)
# @app.route('/search_content', methods=["GET", "POST"])
# def get_top_content():
#     try:
#         # 定义接收参数
#         data = request.get_json()
#         text = data['queryText']
#         contentTypeFlags = data['contentTypeFlags']
#         topicTypeNames = data['topicNames']
#         returenType = data['returenType'] if data['returenType'] else "sen"
#         pageNo = int(data['pageNo'])
#         pageSize = int(data['pageSize'])
#         pStartTime = datetime.date(pytime.parse(data['pStartTime'])) if data['pStartTime'] else pytime.parse("2020-01-01")
#         pEndTime = datetime.date(pytime.parse(data['pEndTime'])) if data["pEndTime"] else pytime.today()
#         logger.info(pStartTime, pEndTime)
#         # todo: 调用搜索函数返回推荐list
#         if returenType == "par":
#             # todo: 先检查缓存是否可用，若不可用则重新查找
#             if os.path.isfile(os.path.join(cache_path, "para.json")):
#                 with open(os.path.join(cache_path, "para.json"), 'r', encoding='utf-8') as f:
#                     para_dict_result = json.load(f)
#                 # todo: 继续判断待查询的内容是否与缓存的对象相同
#                 if text == para_dict_result["text"]:
#                     para_list = para_dict_result["para_list"]
#                     pre_index = pageNo * pageSize - pageSize
#                     suf_index = pageNo * pageSize
#                     result_list = para_list[pre_index:suf_index]
#                     dict_result = {
#                                     'code': 200,
#                                     'message': 'success',
#                                     'result_data': {
#                                         "match_info": result_list,
#                                         "pageNo": pageNo,
#                                         "pageSize": pageSize
#                                     }
#                                 }
#                     logger.info(dict_result)
#                     return json.dumps(dict_result, ensure_ascii=False)
#                 # todo： 否则进行即时查询处理
#                 else:
#                     pageNo = 1
#                     result_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime,
#                                                   pageSize, pageNo)
#                     # todo：先返回top10，然后将后续处理加入队列
#                     config_info = {
#                         "type": "par",
#                         "text": text,
#                         "contentTypeFlags": contentTypeFlags,
#                         "topicTypeNames": topicTypeNames,
#                         "pStartTime": pStartTime,
#                         "pEndTime": pEndTime,
#                         "pageSize": pageSize
#                     }
#                     q.put(config_info)
#                     dict_result = {
#                         'code': 200,
#                         'message': 'success',
#                         'result_data': {
#                             "match_info": result_list,
#                             "pageNo": pageNo,
#                             "pageSize": pageSize
#                         }
#                     }
#                     logger.info(dict_result)
#                     return json.dumps(dict_result, ensure_ascii=False)
#             # todo: 若无缓存，则进行即时查询处理
#             else:
#                 pageNo = 1
#                 result_list = get_para_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
#                 # todo：先返回top10，然后将后续处理加入队列
#                 config_info = {
#                     "type": "par",
#                     "text": text,
#                     "contentTypeFlags": contentTypeFlags,
#                     "topicTypeNames": topicTypeNames,
#                     "pStartTime": pStartTime,
#                     "pEndTime": pEndTime,
#                     "pageSize": pageSize
#                 }
#                 q.put(config_info)
#                 dict_result = {
#                     'code': 200,
#                     'message': 'success',
#                     'result_data': {
#                         "match_info": result_list,
#                         "pageNo": pageNo,
#                         "pageSize": pageSize
#                     }
#                 }
#                 logger.info(dict_result)
#                 return json.dumps(dict_result, ensure_ascii=False)
#         # todo：进入段落库查询
#         else:
#             # todo: 先检查缓存是否可用，若不可用则重新查找
#             if os.path.isfile(os.path.join(cache_path, "sent.json")):
#                 with open(os.path.join(cache_path, "sent.json"), 'r', encoding='utf-8') as f:
#                     sent_dict_result = json.load(f)
#                 # todo: 继续判断待查询的内容是否与缓存的对象相同
#                 if text == sent_dict_result["text"]:
#                     sent_list = sent_dict_result["sent_list"]
#                     pre_index = pageNo * pageSize - pageSize
#                     suf_index = pageNo * pageSize
#                     result_list = sent_list[pre_index:suf_index]
#                     dict_result = {
#                         'code': 200,
#                         'message': 'success',
#                         'result_data': {
#                             "match_info": result_list,
#                             "pageNo": pageNo,
#                             "pageSize": pageSize
#                         }
#                     }
#                     logger.info(dict_result)
#                     return json.dumps(dict_result, ensure_ascii=False)
#                 # todo： 否则进行即时查询处理
#                 pageNo = 1
#                 result_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize,
#                                               pageNo)
#                 # todo：先返回top10，然后将后续处理加入队列
#                 config_info = {
#                     "type": "sent",
#                     "text": text,
#                     "contentTypeFlags": contentTypeFlags,
#                     "topicTypeNames": topicTypeNames,
#                     "pStartTime": pStartTime,
#                     "pEndTime": pEndTime,
#                     "pageSize": pageSize
#                 }
#                 q.put(config_info)
#                 dict_result = {
#                     'code': 200,
#                     'message': 'success',
#                     'resultData': {
#                         "match_info": result_list,
#                         "pageNo": pageNo,
#                         "pageSize": pageSize
#                     }
#                 }
#                 logger.info(dict_result)
#                 return json.dumps(dict_result, ensure_ascii=False)
#             else:
#                 pageNo = 1
#                 result_list = get_sent_result(text, contentTypeFlags, topicTypeNames, pStartTime, pEndTime, pageSize, pageNo)
#                 # todo：先返回top10，然后将后续处理加入队列
#                 config_info = {
#                     "type": "sent",
#                     "text": text,
#                     "contentTypeFlags": contentTypeFlags,
#                     "topicTypeNames": topicTypeNames,
#                     "pStartTime": pStartTime,
#                     "pEndTime": pEndTime,
#                     "pageSize": pageSize
#                 }
#                 q.put(config_info)
#                 dict_result = {
#                     'code': 200,
#                     'message': 'success',
#                     'resultData': {
#                         "match_info": result_list,
#                         "pageNo": pageNo,
#                         "pageSize": pageSize
#                     }
#                 }
#                 logger.info(dict_result)
#                 return json.dumps(dict_result, ensure_ascii=False)
#
#     except Exception as e:
#         traceback.print_exc()
#         dict_result = {
#                         'code': 500,
#                         'message': "failure" + str(e),
#                         'resultData': None
#                       }
#         logger.info(dict_result)
#         return json.dumps(dict_result, ensure_ascii=False)


if __name__ != '__main__':
    # Imported rather than executed (e.g. served by gunicorn): route the Flask
    # app logger through the handlers and level of the 'gunicorn.error' logger.
    gunicorn_logger = logging.getLogger('gunicorn.error')
    app.logger.handlers = gunicorn_logger.handlers
    app.logger.setLevel(gunicorn_logger.level)
else:
    # Executed directly: start Flask's built-in server with the module settings.
    app.run(host=HOST, port=PORT, debug=DEBUG)
