import json
import re
import threading
import time
import uuid

import pymongo
import redis
import requests
from bs4 import BeautifulSoup
from retry import retry
from elasticsearch import Elasticsearch
from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Shared project helper; sqlFlg=False is forwarded to BaseCore (its effect is
# defined in base.BaseCore, which is not visible from this file).
baseCore = BaseCore.BaseCore(sqlFlg=False)
# Logger used by main() and by the script entry point below.
log = baseCore.getLogger()

# MongoDB collection (inside database ZZSN) that receives the cleaned documents.
# NOTE(review): host and credentials are hard-coded in source — consider moving
# them to configuration / environment variables.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017/', username='admin', password='ZZsn@9988').ZZSN[
    '全球企业资讯0710']

# Module-level lock; it is not referenced anywhere in the visible code.
lock = threading.Lock()


class EsMethod(object):
    """Small wrapper around the Elasticsearch client used to page through one subject's documents."""

    def __init__(self):
        # Create the Elasticsearch client and supply the account credentials.
        self.es = Elasticsearch(['http://114.116.19.92:9700'], http_auth=('elastic', 'zzsn9988'), timeout=300)
        self.index_name = 'subjectdatabase'

    def queryatt(self, index_name, pnum, subject_id="1734030182269853697",
                 start_date="2024-07-01T00:00:00", end_date="2024-07-11T00:00:00",
                 size=200):
        """Search one page of documents for a subject within a creation-date window.

        The defaults reproduce the query this script was originally written for, so
        existing ``queryatt(index_name, pnum)`` calls behave exactly as before.

        :param index_name: name of the index to search.
        :param pnum: value sent as the query's ``from`` (offset of the first hit).
        :param subject_id: value matched against the ``subjectId`` field.
        :param start_date: inclusive lower bound (``gte``) for ``createDate``.
        :param end_date: inclusive upper bound (``lte``) for ``createDate``.
        :param size: maximum number of hits returned for this page.
        :return: whatever ``self.es.search`` returns for the request.
        """
        body = {
            "query": {
                "bool": {
                    "must": [
                        {"match": {"subjectId": subject_id}},
                        {
                            "range": {
                                "createDate": {
                                    "gte": start_date,
                                    "lte": end_date,
                                }
                            }
                        },
                    ]
                }
            },
            # Newest documents first.
            "sort": [{"createDate": {"order": "desc"}}],
            "track_total_hits": True,
            "size": size,
            "from": pnum,
        }

        result = self.es.search(index=index_name, doc_type='_doc', body=body)
        return result


def clean_html_tag(content):
    """Strip HTML markup from *content* and return the plain text.

    If the markup contains ``<p``, every closing ``</p>`` is treated as a paragraph
    boundary: each paragraph is parsed and stripped on its own and the results are
    joined with newlines. Otherwise the whole string is parsed as one fragment.

    :param content: HTML string to clean.
    :return: the text with tags removed.
    """
    # TODO: in the production scenario paragraphs are assumed to be delimited by </p>.
    # A tab serves as the temporary paragraph separator, so any literal tab already
    # present in the input also ends up as a paragraph break.
    # Raw strings keep regex escapes such as \s from being read as (invalid)
    # string escapes.
    ori_text = re.sub(r"</p\s*>", "\t", content)
    # Remove self-closing image tags.
    ori_text = re.sub(r"<img.*?/>", "", ori_text)

    if "<p" not in ori_text:
        return BeautifulSoup(ori_text, 'lxml').text.strip()

    paragraphs = [BeautifulSoup(part, 'lxml').text.strip() for part in ori_text.split("\t")]
    return "\n".join(paragraphs)


def preprocess(text: str):
    """Normalise whitespace: trim both ends, delete every space, squeeze newline runs to one."""
    trimmed = text.strip()
    without_spaces = trimmed.replace(' ', '')
    return re.sub('\n+', '\n', without_spaces)


def _clean_text(raw):
    """Return *raw* with HTML removed and whitespace normalised, or *raw* itself if cleaning fails."""
    try:
        return preprocess(clean_html_tag(raw))
    except Exception:
        # Best-effort: values that cannot be cleaned (e.g. None) are stored as they are.
        return raw


def _dynamic_tags(labels):
    """Return the comma-joined ``relationName`` of every label marked ``dynamic_tags``."""
    names = []
    for label in labels or []:
        if label.get('labelMark') != "dynamic_tags":
            continue
        relation_name = label.get('relationName')
        if relation_name is not None:
            names.append(relation_name)
    return ','.join(names)


def main(page, p, esMethod):
    """Fetch one page of hits from Elasticsearch, clean the text fields and store them in MongoDB.

    A hit is skipped when its ``_source`` has no ``content`` or no ``contentWithTag``
    key. All other text fields default to an empty string when absent.

    :param page: page number, used only in the progress log line.
    :param p: offset handed to ``esMethod.queryatt`` as ``pnum``.
    :param esMethod: object providing ``index_name`` and ``queryatt``.
    :return: None.
    """
    result = esMethod.queryatt(index_name=esMethod.index_name, pnum=p)
    try:
        hits = result['hits']
        total = hits['total']['value']
        msglist = hits['hits']
    except (KeyError, TypeError):
        # Unexpected response shape: log it and give up on this page.
        log.info(f'error-----{result}')
        return
    log.info(f'---第{page}页{len(msglist)}条数据----共{total}条数据----')

    for mms in msglist:
        doc_id = mms['_id']
        source = mms['_source']
        # Both body fields are mandatory; hits lacking either are skipped.
        if 'content' not in source or 'contentWithTag' not in source:
            continue

        title = source.get('title', '')
        contentWithTag = source['contentWithTag']
        titleRaw = source.get('titleRaw', '')

        pre_content = _clean_text(source['content'])
        pre_summary = _clean_text(source.get('summary', ''))
        pre_contentRaw = _clean_text(source.get('contentRaw', ''))
        pre_summaryRaw = _clean_text(source.get('summaryRaw', ''))

        log.info(f'{doc_id}--{title}---')

        info_tags = _dynamic_tags(source.get('labels'))

        # Store the record in the database.
        dic = {
            "id": doc_id,
            "标题": title,
            "摘要": pre_summary,
            "内容": pre_content,
            "带标签内容": contentWithTag,
            "标题译文": titleRaw,
            "摘要译文": pre_summaryRaw,
            "内容译文": pre_contentRaw,
            "标签": info_tags,
        }
        db_storage.insert_one(dic)

def run_threads(num_threads,esMethod,j):
    """Process ``num_threads`` consecutive result pages concurrently, one thread per page.

    Thread ``i`` runs ``main`` with offset ``j + i * 200``. The call returns only after
    every thread has finished.

    :param num_threads: number of worker threads to start.
    :param esMethod: query helper passed through to ``main``.
    :param j: offset of the first page in this batch.
    """
    # Stride between the offsets of neighbouring pages.
    page_size = 200
    workers = []

    for i in range(num_threads):
        offset = j + i * page_size
        # 1-based page number derived from the offset so that numbering continues
        # across batches (offset 1000 -> page 6 rather than 1001).
        page = offset // page_size + 1
        worker = threading.Thread(target=main, args=(page, offset, esMethod))
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()



if __name__ == "__main__":
    num_threads = 5
    # Two batches; the starting offset advances by 1000 between them.
    for j in (0, 1000):
        esMethod = EsMethod()
        start = time.time()
        run_threads(num_threads, esMethod, j)
        elapsed = time.time() - start
        log.info(f'5线程 每个处理200条数据 总耗时{elapsed}秒')