
import json
import random
import re
import time

import fitz
import pymysql
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from datetime import datetime
from base import BaseCore
# from fdfs_client.client import get_tracker_conf, Fdfs_client

# Shared project helper object; this script takes its logger and its
# connection/cursor pairs from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Connection/cursor pair used by InsterInto for the brpa_source_article table.
cnx = baseCore.cnx
cursor = baseCore.cursor

# Second connection/cursor pair, used by secrchATT/tableUpdate for the
# clb_sys_attachment table.
# NOTE(review): presumably a different database from the pair above -- confirm in BaseCore.
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_
# tracker_conf = get_tracker_conf('./client.conf')
# client = Fdfs_client(tracker_conf)

# Task label passed to baseCore.recordLog by the functions in this file.
taskType = '企业公告/证监会/新三板'

# TODO: NEEQ announcements (股转公告) and listing reviews (挂牌审核) are folded
# into company announcements (公司公告); they have no separate id.
# Maps a disclosure-type code to its display label.
type_map = {
    'zljgcs': '自律监管措施',
    'wxh': '问询函',
    'jlcf': '纪律处分',
    # Every numeric disclosure-type code shares the same label.
    **dict.fromkeys(
        ('9506', '9509', '9503', '9504', '9505', '9510', '9520', '9605', '9533'),
        '公司公告',
    ),
}

def secrchATT(item_id, name, type_id):
    """Look up an attachment row in ``clb_sys_attachment``.

    Runs a parameterized SELECT of the ``id`` column on the module-level
    ``cursor_``, filtering on ``item_id``, ``name`` and ``type_id``.

    Returns:
        The result of ``cursor_.fetchone()`` for that query.
    """
    cursor_.execute(
        '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s ''',
        (item_id, name, type_id),
    )
    return cursor_.fetchone()


def tableUpdate(retData, com_name, year, pdf_name, num):
    """Insert an attachment row into ``clb_sys_attachment`` and return its id.

    If ``secrchATT`` already finds a row for (item_id, pdf_name, type_id),
    nothing is inserted and the id of that row is returned instead.

    Args:
        retData: mapping that must provide the keys listed in
            ``required_keys`` below.
        com_name: company name, only used in the "already exists" log line.
        year: value stored in the ``year`` column.
        pdf_name: value stored in the ``name`` column.
        num: value stored in the ``order_by`` column.

    Returns:
        The first column (``id``) of the row returned by ``secrchATT``.
    """
    # All keys are read up front, in this order, so a missing key raises
    # KeyError before any database work happens.
    required_keys = (
        'item_id', 'type_id', 'group_name', 'path', 'full_path', 'category',
        'file_size', 'status', 'create_by', 'page_size', 'create_time',
    )
    row = {key: retData[key] for key in required_keys}

    existing = secrchATT(row['item_id'], pdf_name, row['type_id'])
    if existing:
        log.info(f'com_name:{com_name}已存在')
        return existing[0]

    insert_sql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    params = (
        year,
        pdf_name,
        row['type_id'],
        row['item_id'],
        row['group_name'],
        row['path'],
        row['full_path'],
        row['category'],
        row['file_size'],
        num,
        row['status'],
        row['create_by'],
        row['create_time'],
        row['page_size'],
    )

    cursor_.execute(insert_sql, params)
    cnx_.commit()
    log.info("更新完成:{}".format(insert_sql))

    # Read the row back to obtain the id of the record just inserted.
    inserted = secrchATT(row['item_id'], pdf_name, row['type_id'])
    return inserted[0]

def RequestUrl(url, payload, social_code,start_time):
    """POST ``payload`` to ``url`` and return the JSON value embedded in the reply.

    The reply text is searched for a ``([ ... ])`` wrapper and the text between
    ``([`` and the first ``])`` is decoded with ``json.loads``. The POST is
    attempted up to three times; any exception triggers the next attempt.

    Args:
        url: endpoint to POST to.
        payload: form-encoded request body.
        social_code: company identifier, only used when recording a failure.
        start_time: ``time.time()`` value used to compute the elapsed time
            recorded on failure.

    Returns:
        The decoded JSON value on success. On any failure (no response after
        all attempts, non-200 status, no JSON wrapper in the reply, invalid
        JSON) the failure is logged and recorded through
        ``baseCore.recordLog`` and ``''`` is returned.
    """
    pattern = r"\(\[(.*?)\]\)"
    response = None
    for m in range(0, 3):
        try:
            # ``headers`` is the module-level dict built in the __main__ block.
            # The timeout keeps a stalled connection from blocking forever; a
            # timeout raises and is retried like any other request error.
            reply = requests.post(url=url, headers=headers, data=payload, timeout=30)
            reply.encoding = reply.apparent_encoding
            response = reply
            break
        except Exception as e:
            log.error(f"request请求异常----{m}-----{e}")

    if response is None:
        reason = 'no response after 3 attempts'
    elif response.status_code != 200:
        reason = f'HTTP {response.status_code}'
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        match = re.search(pattern, str(soup))
        if match is None:
            reason = 'no JSON payload found in response'
        else:
            try:
                return json.loads(match.group(1))
            except ValueError as e:
                reason = f'invalid JSON payload: {e}'

    # Single failure path: log, record the failed task, return the empty marker.
    log.error(f'请求失败:{url} ({reason})')
    state = 0
    takeTime = baseCore.getTimeCost(start_time, time.time())
    baseCore.recordLog(social_code, taskType, state, takeTime, url, '请求失败')
    return ''

def getPages(url,com_code):
    """Return the number of result pages available for company ``com_code``.

    Sends the page=1 query through ``RequestUrl`` and reads
    ``listInfo.totalPages`` from the decoded reply.

    NOTE(review): ``social_code`` and ``start_time`` are not parameters; they
    are read from module globals set in the ``__main__`` block, so this
    function only works when the file is run as a script.

    Args:
        url: query endpoint.
        com_code: company code placed in the ``companyCd`` form field.

    Returns:
        The ``totalPages`` value from the reply, or ``0`` when the request
        failed (``RequestUrl`` returned an empty value).
    """
    payload = f"startTime=&page=1&companyCd={com_code}&keyword=&disclosureType%5B%5D=9506&disclosureType%5B%5D=9509&disclosureType%5B%5D=9503&disclosureType%5B%5D=9504&disclosureType%5B%5D=9505&disclosureType%5B%5D=9510&disclosureType%5B%5D=9520&disclosureType%5B%5D=9605&disclosureType%5B%5D=9533&wxhType=wxh&zljgcsType=zljgcs&jlcfType=jlcf&newThreeArray%5B%5D=0&newThreeArray%5B%5D=1&newThreeArray%5B%5D=2&siteId=1&sortfield=publishDate&sorttype=desc&keyword1="
    retJsonData = RequestUrl(url, payload, social_code, start_time)
    if not retJsonData:
        # RequestUrl returns '' on failure; indexing it would raise TypeError.
        # Reporting zero pages lets the caller's page loop simply do nothing.
        log.error(f'获取总页数失败:{com_code}')
        return 0
    totalPages = retJsonData['listInfo']['totalPages']
    print(totalPages)
    return totalPages

def InsterInto(short_name, social_code, pdf_url):
    """Register an announcement in ``brpa_source_article`` unless already present.

    A row counts as present when one exists with the same
    ``social_credit_code`` and ``source_address`` for this origin and type.

    Args:
        short_name: company short name, only used in the "already exists" message.
        social_code: value for the ``social_credit_code`` column.
        pdf_url: value for the ``source_address`` column.

    Returns:
        ``True`` when a new row was inserted and committed; ``False`` when the
        row already existed or the insert failed (the failure is logged and
        recorded through ``baseCore.recordLog``).
    """
    sel_sql = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='全国中小企业股份转让系统' and type='1' '''
    cursor.execute(sel_sql, (social_code, pdf_url))
    if cursor.fetchone():
        print(f'com_name:{short_name}、{pdf_url}已存在')
        return False

    insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,create_time) values(%s,%s,%s,%s,now())'''
    values = (
        social_code,
        pdf_url,
        '全国中小企业股份转让系统',
        '1',
    )
    try:
        cursor.execute(insert_sql, values)
        cnx.commit()
    except Exception as e:
        # The original returned an unassigned name here (``insert`` vs
        # ``inster``), so a failed insert raised UnboundLocalError instead of
        # reporting False.
        log.error(f'数据库传输失败:{pdf_url}----{e}')
        state = 0
        # ``start_time`` is a module global set in the __main__ block.
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return False
    return True


def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
    """Upload one PDF, register it as an attachment and publish it to Kafka.

    Steps: ``baseCore.upLoadToServe`` uploads the file, ``tableUpdate`` stores
    the attachment row, then a JSON message describing the announcement is
    sent to the ``researchReportTopic`` Kafka topic.

    Args:
        pdf_url: URL of the PDF; also sent as ``sourceAddress``.
        pdf_name: announcement title; also the attachment name.
        social_code: company identifier.
        year: publication year.
        pub_time: publication date, sent as ``publishDate``.
        start_time: ``time.time()`` value used for the elapsed time recorded
            on Kafka failure.
        com_name: company name, forwarded to ``tableUpdate``.
        num: ordering base; ``num + 1`` is stored as the attachment order.

    Returns:
        ``True`` when the message was delivered, ``False`` when the upload
        result reports failure or the Kafka send fails.
    """
    # Upload to the file server.
    # NOTE(review): the meaning of the literal 8 is defined by
    # baseCore.upLoadToServe and is not visible here.
    retData = baseCore.upLoadToServe(pdf_url,8,social_code)
    if not retData['state']:
        log.info('====pdf解析失败====')
        return False

    # Store the attachment row; the returned id is referenced by the message.
    num = num + 1
    att_id = tableUpdate(retData,com_name,year,pdf_name,num)
    content = retData['content']

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': att_id,
        'author': '',
        'content': content,
        'contentWithTag': '',
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': 'zh',
        'origin': '全国中小企业股份转让系统',
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': pdf_url,  # link to the original document
        'summary': '',
        'title': pdf_name,
        'type': 3,
        'socialCreditCode': social_code,
        'year': year
    }

    # Publish the record through Kafka.
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("researchReportTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        # Block until the broker acknowledges the message (or 10 s elapse).
        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        print(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
        print(dic_result)
        return False
    finally:
        # A producer is created per call, so it must be closed per call;
        # otherwise its connections and background thread pile up.
        if producer is not None:
            try:
                producer.close(timeout=10)
            except Exception as e:
                # Best-effort cleanup: never let it change the result above.
                log.error(f'Kafka producer关闭失败----{e}')

# Collect announcements for one company.
def SpiderByZJH(url, dic_info, start_time,num):  # dic_info: basic company info fetched from the database
    """Crawl every announcement page of one company and ingest new documents.

    For each listed document: register it with ``InsterInto``; when it is new,
    hand it to ``GetContent`` and record the outcome.

    NOTE(review): ``com_code`` is not a parameter; it is read from a module
    global set in the ``__main__`` block.

    Args:
        url: query endpoint.
        dic_info: indexable company record; index 1 is the company name,
            2 the social credit code and 4 the short name.
        start_time: ``time.time()`` value used for elapsed-time records.
        num: ordering base forwarded to ``GetContent``.
    """
    okCount = 0
    errorCount = 0
    social_code = dic_info[2]
    short_name = dic_info[4]
    com_name = dic_info[1]

    totalPages = getPages(url, com_code)
    for i in range(0, int(totalPages)):
        payload = f"startTime=&page={i}&companyCd={com_code}&keyword=&disclosureType%5B%5D=9506&disclosureType%5B%5D=9509&disclosureType%5B%5D=9503&disclosureType%5B%5D=9504&disclosureType%5B%5D=9505&disclosureType%5B%5D=9510&disclosureType%5B%5D=9520&disclosureType%5B%5D=9605&disclosureType%5B%5D=9533&wxhType=wxh&zljgcsType=zljgcs&jlcfType=jlcf&newThreeArray%5B%5D=0&newThreeArray%5B%5D=1&newThreeArray%5B%5D=2&siteId=1&sortfield=publishDate&sorttype=desc&keyword1="
        retjson = RequestUrl(url, payload, social_code, start_time)
        if not retjson:
            # RequestUrl returns '' on failure; skip this page rather than
            # crash and lose the remaining pages.
            log.error(f'{short_name}=============第{i}页请求失败')
            continue

        for rp in retjson['listInfo']['content']:
            pdf_url = 'https://www.neeq.com.cn' + rp['destFilePath']
            name_pdf = rp['disclosureTitle']
            # The type label looked up from type_map was never used, and the
            # lookup raised KeyError on unmapped type codes, so it is dropped.
            publishDate = rp['publishDate']
            year = publishDate[:4]

            # Register the document.
            # NOTE(review): the title, not pdf_url, is passed as InsterInto's
            # pdf_url argument. Left unchanged because switching would alter
            # de-duplication against rows already stored -- confirm intent.
            if not InsterInto(short_name, social_code, name_pdf):
                log.info(f'======={short_name}===========已存在')
                continue

            # New document: download/parse the PDF and publish it.
            log.info(f'======={short_name}===========插入公告库成功')
            result = GetContent(pdf_url, name_pdf, social_code, year, publishDate, start_time, com_name, num)

            if not result:
                errorCount += 1
                log.error(f'{short_name}=============解析或传输操作失败')
                continue

            okCount = okCount + 1
            log.info(f'{short_name}==============解析传输操作成功')
            state = 1
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '')

    # The counters were previously accumulated but never reported.
    log.info(f'{short_name}==============成功:{okCount} 失败:{errorCount}')


if __name__ == '__main__':
    num = 0
    # Request headers; read as a module global by RequestUrl.
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Length': '442',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'HOY_TR=FCEXZOPIKBGTYHDL,945C236781ABDFE0,xfslaodpytzTmieq; Hm_lvt_b58fe8237d8d72ce286e1dbd2fc8308c=1694480321; Hm_lpvt_b58fe8237d8d72ce286e1dbd2fc8308c=1694597182',
        'Host': 'www.neeq.com.cn',
        'Origin': 'https://www.neeq.com.cn',
        'Pragma': 'no-cache',
        'Referer': 'https://www.neeq.com.cn/products/neeq_listed_companies/related_announcement.html?companyCode=430054',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }

    dic_parms = {}
    # The loop never exits on its own, so the cleanup used to be unreachable.
    # Running it from ``finally`` releases the resources when the loop is left
    # through an exception or Ctrl-C.
    try:
        # Intended flow: read stock code, short name and social credit code
        # from the database.
        while True:
            start_time = time.time()
            # # Fetch company info
            # # social_code = baseCore.redicPullData('NoticeEnterpriseFbs:gnqy_socialCode')
            # social_code, com_code and start_time are read as module globals
            # by getPages / SpiderByZJH / InsterInto.
            social_code = '9110000071092841XX'
            com_code = '430045'
            short_name = '超毅网络'
            # NOTE(review): SpiderByZJH reads dic_info[1], [2] and [4]; with
            # this empty dict it raises KeyError immediately. Restore the
            # getInfomation call below (or fill the dict) before running.
            dic_info = {}
            # # If Redis has no more data, wait
            # if social_code == None:
            #     time.sleep(20)
            #     continue
            # dic_info = baseCore.getInfomation(social_code)
            # count = dic_info[16]
            url = 'https://www.neeq.com.cn/disclosureInfoController/productInfoResult.do'
            # Paging: page 0 ~ 25, bounded by totalPages

            SpiderByZJH(url, dic_info, start_time, num)
    finally:
        cursor.close()
        cnx.close()
        # cursor_.close()
        # cnx_.close()
        # Release resources
        baseCore.close()
