
import json
import os
import re
import subprocess
import sys
import time
import uuid
from datetime import datetime
import random

import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from retry import retry

from base import BaseCore
from obs import ObsClient
import fitz
from urllib.parse import unquote
# Shared project helper: provides the logger, the DB connections/cursors,
# proxies and run-log recording used throughout this script.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Connection/cursor used for brpa_source_article (see ifInstert / InsterInto).
cnx = baseCore.cnx
cursor = baseCore.cursor

# Connection/cursor used for clb_sys_attachment (see secrchATT / tableUpdate).
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_

# Task label passed to baseCore.recordLog.
taskType = '企业公告/证监会'

# SECURITY NOTE(review): the access key / secret below are committed in source.
# They can now be overridden through environment variables; the literals are
# kept only as a backward-compatible fallback and should be rotated and removed.
obsClient = ObsClient(
        access_key_id=os.environ.get('OBS_ACCESS_KEY_ID', 'VEHN7D0TJ9316H8AHCAV'),  # Huawei Cloud access key (AK)
        secret_access_key=os.environ.get('OBS_SECRET_ACCESS_KEY', 'heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY'),  # Huawei Cloud secret key (SK)
        server=os.environ.get('OBS_SERVER', 'https://obs.cn-north-1.myhuaweicloud.com')  # OBS endpoint of the bucket
    )
# Object-key prefix for uploaded announcement PDFs.
pathType = 'QYNotice/'

def getuuid():
    """Return a new time-based (version 1) ``uuid.UUID`` object."""
    return uuid.uuid1()

# Format a byte count as a human-readable size string.
def convert_size(size_bytes):
    """Return ``size_bytes`` scaled to the largest fitting unit, e.g. ``"1.50 KB"``.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then rendered with two decimals.
    """
    unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB')
    largest = len(unit_names) - 1
    unit_idx = 0
    while size_bytes >= 1024 and unit_idx < largest:
        size_bytes = size_bytes / 1024
        unit_idx += 1
    return "{:.2f} {}".format(size_bytes, unit_names[unit_idx])

def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    """Download a PDF, upload it to OBS and extract its text.

    Args:
        pdf_url: URL the PDF is downloaded from.
        pdf_name: Not used by this function; kept for interface compatibility.
        type_id: Stored as ``retData['type_id']``.
        social_code: Stored as ``retData['item_id']`` and used for run logs.

    Returns:
        dict: attachment metadata plus the extracted text in ``'content'``.
        ``'state'`` is True only when download, upload and parsing all
        succeeded and every metadata field was filled; otherwise it stays False.
    """
    called_at = time.time()
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': 'pdf', 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Up to three download attempts; a non-200 answer counts as a failed attempt
    # so that error pages are not uploaded to OBS.
    response = None
    for _ in range(3):
        try:
            candidate = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
        except Exception:
            time.sleep(3)
            continue
        if candidate.status_code == 200:
            response = candidate
            break
        time.sleep(3)
    if response is None:
        log.error(f'文件下载失败:{pdf_url}')
        return retData

    # Content-Length may be absent (e.g. chunked transfer); fall back to the
    # size of the downloaded body instead of re-downloading or failing later.
    try:
        file_size = int(response.headers.get('Content-Length'))
    except (TypeError, ValueError):
        file_size = len(response.content)

    name = str(getuuid()) + '.pdf'
    try:
        result = getOBSres(pathType, name, response)
    except Exception:
        log.error(f'OBS发送失败')
        return retData

    try:
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            page_size = doc.page_count
            retData['content'] = ''.join(page.get_text() for page in doc.pages())
    except Exception:
        log.error(f'文件损坏')
        return retData

    if page_size < 1:
        # PDF could not be parsed into any page.
        return retData

    try:
        object_url = result['body']['objectUrl']
        object_path = object_url.split('.com')[1]
        readable_size = convert_size(file_size)
    except Exception as e:
        # 'start_time' is a module global set by the script entry point; fall
        # back to this call's own start when it is not defined.
        takeTime = baseCore.getTimeCost(globals().get('start_time', called_at), time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, pdf_url, f'{e}')
        return retData

    # Flip 'state' only after every field is known, so callers never see a
    # "successful" record with missing metadata.
    retData['path'] = object_path
    retData['full_path'] = object_url
    retData['file_size'] = readable_size
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    retData['state'] = True
    return retData

@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
    """Upload ``response.content`` to bucket 'zzsn' under key ``pathType + name``.

    Returns whatever ``obsClient.putContent`` returns. The call is wrapped by
    ``@retry(tries=3, delay=1)``.
    """
    object_key = pathType + name
    body = response.content
    return obsClient.putContent('zzsn', object_key, content=body)


def secrchATT(item_id, retData, type_id,order_by):
    """Fetch the first clb_sys_attachment row matching item, path, type and order.

    The path is read from ``retData['path']``. Returns the result of
    ``cursor_.fetchone()`` for the ``id`` column query.
    """
    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s and order_by=%s '''
    params = (item_id, retData['path'], type_id, order_by)
    cursor_.execute(query, params)
    return cursor_.fetchone()

# Insert a row into the attachment table and return the attachment id.
def tableUpdate(retData, com_name, year, pdf_name, num,pub_time,origin):
    """Insert the uploaded file into clb_sys_attachment and return its id.

    Args:
        retData: metadata dict produced by ``uptoOBS``.
        com_name: Not used by this function; kept for interface compatibility.
        year: value for the ``year`` column.
        pdf_name: value for the ``name`` column.
        num: value for the ``order_by`` column.
        pub_time: value for the ``publish_time`` column.
        origin: value for the ``source`` column.

    Returns:
        The id of the inserted row, or ``''`` (falsy) when the insert failed
        or the row could not be read back, so callers can test the result.
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    full_path = retData['full_path']
    order_by = num

    Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name,publish_time,source) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    try:
        # object_key is the part of the URL after the bucket host.
        object_key = full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1]
        values = (
            year, pdf_name, type_id, item_id, retData['group_name'], retData['path'], full_path,
            retData['category'], retData['file_size'], order_by,
            retData['status'], retData['create_by'],
            retData['create_time'], retData['page_size'], object_key, 'zzsn',
            pub_time, origin)
        cursor_.execute(Upsql, values)
        cnx_.commit()
    except Exception as e:
        # Previously the error was only printed and the code went on to report
        # success and dereference a missing row; report and bail out instead.
        log.error(f'附件入库失败:{item_id}===={pdf_name}===={e}')
        return ''

    log.info(f"更新完成:{item_id}===={pdf_name}")
    selects = secrchATT(item_id, retData, type_id,order_by)
    if not selects:
        log.error(f'附件入库后未查询到记录:{item_id}===={pdf_name}')
        return ''
    return selects[0]

@retry(tries=3, delay=5)
def RequestUrl(url, payload, social_code,start_time):
    """POST ``payload`` to ``url`` through a proxy and return the parsed HTML.

    Uses the module-level ``headers`` dict, which is defined by the script
    entry point. ``social_code`` and ``start_time`` are not used here; they
    are kept for interface compatibility.

    Returns:
        BeautifulSoup: the response body parsed with 'html.parser'.

    Raises:
        RuntimeError: when the response status is not 200.
        Exception: whatever ``requests.post`` raises (including timeouts).
        The function is wrapped by ``@retry(tries=3, delay=5)``.
    """
    ip = baseCore.get_proxy()

    # A timeout keeps a stalled proxy/server from hanging the crawler forever;
    # 20 seconds matches the download timeout used in uptoOBS.
    response = requests.post(url=url, headers=headers, data=payload, proxies=ip, timeout=20)
    if response.status_code != 200:
        # A bare `raise` here had no active exception to re-raise and only
        # produced an uninformative RuntimeError; raise one with details.
        raise RuntimeError(f'unexpected HTTP status {response.status_code} for {url}')
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')


def getUrl(code, url_parms, Catagory2_parms):
    """Build the listing URL and POST payload for a stock code.

    The exchange is chosen from the first character of ``code``:
    '0'/'2'/'3' use index 1 of the parameter lists (Shenzhen), '6'/'9' use
    index 0 (Shanghai) and '4'/'8' use index 2 (Beijing).

    Args:
        code: stock code string.
        url_parms: column ids, indexed as described above.
        Catagory2_parms: category ids, indexed the same way.

    Returns:
        dict with keys 'code', 'url', 'Catagory2' and 'payload', or None when
        the code is empty, its first character is not recognised, or the
        parameter lists have no entry for that exchange.
    """
    if not code:
        return None

    # (index into the parameter lists, extra form fields sent for that exchange)
    market = code[0]
    if market in ('0', '2', '3'):
        idx, extra_fields = 1, ('selBoardCode0', 'selBoardCode')
    elif market in ('6', '9'):
        idx, extra_fields = 0, ('selCatagory3', 'selBoardCode0', 'selBoardCode')
    elif market in ('4', '8'):
        idx, extra_fields = 2, ()
    else:
        # Previously fell through to `return dic_parms` with the name unbound.
        return None

    try:
        column_id = url_parms[idx]
        Catagory2 = Catagory2_parms[idx]
    except IndexError:
        # e.g. a column that does not exist for this exchange.
        return None

    url = f'http://eid.csrc.gov.cn/{column_id}/index_f.html'
    # Form fields of the search request; prodType carries the stock code.
    payload = {
        'prodType': f'{code}',
        'prodType2': '代码/简称/拼音缩写 ',
        'keyWord': '',
        'keyWord2': '关键字',
        'startDate': '',
        'startDate2': '请输入开始时间',
        'endDate': '',
        'endDate2': '请输入结束时间',
        'selCatagory2': f'{Catagory2}',
    }
    for field in extra_fields:
        payload[field] = ''

    return {
        'code': code,
        'url': url,
        'Catagory2': Catagory2,
        'payload': payload
    }

def ifInstert(short_name, social_code, pdf_url):
    """Tell whether the notice still needs to be inserted.

    Queries brpa_source_article for a row with this social credit code and
    source address (origin '证监会', type '1'). Returns False and logs when
    such a row exists, True otherwise.
    """
    query = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='证监会' and type='1' '''
    cursor.execute(query, (social_code, pdf_url))
    existing_row = cursor.fetchone()
    if not existing_row:
        return True
    # Already stored: the caller should skip it.
    log.info(f'com_name:{short_name}、{pdf_url}已存在')
    return False

def InsterInto(social_code, pdf_url,pub_time,pdf_name):
    """Record a collected notice in brpa_source_article.

    Args:
        social_code: value for ``social_credit_code``.
        pdf_url: value for ``source_address``.
        pub_time: value for ``publish_time``.
        pdf_name: value for ``title``.

    Returns:
        bool: True when the row was inserted and committed, False otherwise
        (the failure is logged and recorded through ``baseCore.recordLog``).
    """
    called_at = time.time()
    insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
    row = (social_code, pdf_url, '证监会', '1', pub_time, pdf_name)
    try:
        cursor.execute(insert_sql, row)
        cnx.commit()
    except Exception as e:
        # Keep the real cause in the log; it used to be swallowed silently.
        log.error(f'数据库传输失败:{pdf_url}----{e}')
        # 'start_time' is a module global set by the script entry point; fall
        # back to this call's own start when it is not defined.
        takeTime = baseCore.getTimeCost(globals().get('start_time', called_at), time.time())
        baseCore.recordLog(social_code, taskType, 0, takeTime, pdf_url, '数据库传输失败')
        return False
    return True

def GetContent(pdf_url, pdf_name, social_code, year, pub_time, start_time,com_name,num):
    """Upload one notice PDF, register it as an attachment and publish it to Kafka.

    Args:
        pdf_url: source URL of the PDF.
        pdf_name: file name of the notice (a '.pdf' suffix is removed for the title).
        social_code: social credit code of the company.
        year: publication year.
        pub_time: publication time string.
        start_time: start timestamp used when recording the time cost.
        com_name: company name, forwarded to ``tableUpdate``.
        num: base ordering number; ``num + 1`` is stored as ``order_by``.
            NOTE(review): the increment is local to this call, so callers
            that always pass the same value always get the same order_by.

    Returns:
        bool: True when the message was acknowledged by Kafka, False when the
        PDF could not be processed, stored, or sent.
    """
    # Upload to OBS and extract the text.
    retData = uptoOBS(pdf_url,pdf_name,8,social_code)
    if not retData['state']:
        log.info(f'====pdf解析失败====')
        return False

    # Register the attachment in the database.
    num = num + 1
    origin = '证监会'
    att_id = tableUpdate(retData,com_name,year,pdf_name,num,pub_time,origin)
    if not att_id:
        return False

    content = retData['content']
    lang = baseCore.detect_language(content)
    if lang == 'cn':
        lang = 'zh'
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_news = {
        'attachmentIds': att_id,
        'author': '',
        'content': content,
        'contentWithTag': '',
        'createDate': time_now,
        'deleteFlag': '0',
        'id': '',
        'keyWords': '',
        'lang': lang,
        'origin': origin,
        'publishDate': pub_time,
        'sid': '1684032033495392257',
        'sourceAddress': pdf_url,  # link to the original document
        'summary': '',
        'title': pdf_name.replace('.pdf',''),
        'type': 3,
        'socialCreditCode': social_code,
        'year': year
    }

    # Send the record through Kafka. The producer is created per call, so it
    # must be closed here; otherwise every document leaks its connections.
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024*1024*20)
        kafka_result = producer.send("researchReportNoticeTopic", json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'ture',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, 'Kafka操作失败')
        log.info(dic_result)
        return False
    finally:
        if producer is not None:
            try:
                producer.close()
            except Exception as close_error:
                # Closing is best-effort; it must not change the result above.
                log.error(f'Kafka producer close failed: {close_error}')

# Collect announcements for one company from one listing column.
_DATE_ONLY_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")  # YYYY-MM-DD
_DATE_TIME_RE = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")  # YYYY-MM-DD HH:MM:SS


def _parse_pub_time(raw):
    """Return a datetime for a 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' string, else None."""
    matched = _DATE_ONLY_RE.match(raw)
    fmt = "%Y-%m-%d"
    if matched is None:
        matched = _DATE_TIME_RE.match(raw)
        fmt = "%Y-%m-%d %H:%M:%S"
    if matched is None:
        return None
    try:
        return datetime.strptime(matched.group(0), fmt)
    except ValueError:
        # Right shape but not a real calendar date/time.
        return None


def _parse_download_args(onclick):
    """Split a "downloadPdf1('url','name','date',...)" handler into (url, name + '.pdf', date).

    Raises IndexError when fewer than three arguments can be found.
    """
    # Take everything after the opening parenthesis; str.strip('downloadPdf1(')
    # strips a *set of characters* from both ends, not that prefix.
    args = onclick.partition('(')[2]
    quoted = args.split('\',')
    pdf_url = args.split(',')[0].strip('\'')
    name_pdf = quoted[1].strip('\'') + '.pdf'
    pub_time_ = quoted[2].strip('\'')
    return pdf_url, name_pdf, pub_time_


def SpiderByZJH(url, payload, dic_info, start_time,num):  # dic_info: company record read from the database
    """Crawl the notice listing at ``url`` and process every new PDF.

    Args:
        url: first listing page of the column.
        payload: POST form fields for the listing request.
        dic_info: company record; indexes 1 (company name), 2 (social credit
            code), 3 (stock code) and 4 (short name) are read here.
        start_time: start timestamp used when recording time costs.
        num: ordering number forwarded to ``GetContent``.

    Returns:
        None. Results are reported through logs, ``baseCore.recordLog`` and
        the database/Kafka writes done by the helpers it calls.
    """
    social_code = dic_info[2]
    code = dic_info[3]  # read locally instead of relying on a global set by the entry point
    short_name = dic_info[4]
    if short_name == 'None':
        short_name = dic_info[1]
    com_name = dic_info[1]
    try:
        soup = RequestUrl(url, payload, social_code, start_time)
    except Exception as e:
        # Request failed: log it and put the company back into redis.
        log.error(f'请求失败:{url}----{e}')
        baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode_add', social_code)
        time.sleep(random.randint(60, 120))
        # Start a fresh copy of this script, then kill the current process.
        current_pid = baseCore.getPID()
        subprocess.Popen([sys.executable] + sys.argv)
        os.kill(current_pid, 9)
        return

    # The site answers with a plain message when nothing matches the query.
    notice_box = soup.find('div', class_='con')
    if notice_box is not None and notice_box.text == '没有查询到数据':
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, url, '没有查询到数据')
        return

    # Read the total number of records from the pager.
    pages_div = soup.find('div', class_='pages')
    pager = pages_div.find('ul', class_='g-ul') if pages_div is not None else None
    numbers = re.findall(r'\d+', pager.text) if pager is not None else []
    if not numbers:
        log.error(f'未找到分页信息:{url}')
        return
    total = numbers[0]

    # 15 records per page, capped at 50 pages.
    Maxpage = min(-(-int(total) // 15), 50)
    log.info(f'{short_name}====={code}===========一共{total}条,{Maxpage}页')

    for i in range(1,Maxpage+1):
        log.info(f'==========正在采集第{i}页=========')
        if i == 1:
            # The first page was already fetched above with the same url/payload.
            page_soup = soup
        else:
            # e.g. http://eid.csrc.gov.cn/101811/index_3_f.html
            href = url.split('index')[0] + f'index_{i}_f.html'
            try:
                page_soup = RequestUrl(href, payload, social_code, start_time)
            except Exception as e:
                log.error(f'请求失败:{href}----{e}')
                # Put the company back into redis and skip this page; falling
                # through would re-process the previous page's content.
                baseCore.rePutIntoR('NoticeEnterprise:gnqy_socialCode_add', social_code)
                time.sleep(random.randint(60, 120))
                continue

        table_div = page_soup.find('div', id='txt')
        if table_div is None:
            log.error(f'未找到公告列表:第{i}页')
            continue

        # The first row is skipped (tr_list[1:] in the original code).
        for tr in table_div.find_all('tr')[1:]:
            td_list = tr.find_all('td')
            try:
                pdf_url, name_pdf, pub_time_ = _parse_download_args(td_list[2]['onclick'])
            except (IndexError, KeyError):
                log.error(f'公告行解析失败:第{i}页')
                continue

            # Skip rows whose publish time is not a date or date-time.
            date_object = _parse_pub_time(pub_time_)
            if date_object is None:
                continue
            pub_time = date_object.strftime("%Y-%m-%d %H:%M:%S")
            year = pub_time[:4]

            # ifInstert returns True when the notice is not stored yet.
            ifexist = ifInstert(short_name, social_code, pdf_url)
            if not ifexist:
                log.info(f'======={short_name}========{code}===已存在')
                # NOTE(review): this only stops the current page; later pages
                # are still requested. Confirm whether the whole crawl should stop.
                break

            # Download, parse and publish the PDF.
            result = GetContent(pdf_url, name_pdf, social_code, year, pub_time, start_time,com_name,num)
            if not result:
                continue

            log.info(f'{short_name}==============解析传输操作成功')
            state = 1
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '成功')

            # Only after Kafka succeeded is the notice recorded in the database.
            insert = InsterInto(social_code,pdf_url,pub_time,name_pdf)
            if insert:
                log.info(f'===={social_code}========{name_pdf}=====插入库成功')

if __name__ == '__main__':
    num = 0
    # Request headers for the listing pages; read as a global by RequestUrl.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # 'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Content-Length': '380',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'acw_tc=2760825217168606497214655ec9cb62ffa696c5367ec9f402d2086a0287ae; tgw_l7_route=125d8c38fe1eb06650b04b0cc6f51270',
        # 'Host': 'eid.csrc.gov.cn',
        # 'Origin': 'http://eid.csrc.gov.cn',
        # 'Pragma': 'no-cache',
        'Referer': 'http://eid.csrc.gov.cn/101111/index_1_f.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }

    # NOTE(review): not referenced anywhere in the visible code.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'ba17301551dcbaf9_gdp_user_key=; gdp_user_id=gioenc-4c21c93a%2Ccdgd%2C5c8b%2Cc32e%2C8g0229546a17; ba17301551dcbaf9_gdp_session_id_dc777856-a24e-4008-a8a6-af88d75bae2b=true; ba17301551dcbaf9_gdp_sequence_ids={%22globalKey%22:3%2C%22VISIT%22:2%2C%22PAGE%22:2}; acw_tc=71dbb29c16908906086793104e8117f44af84d756f68927c202e9a70b1',
        'Host': 'static.sse.com.cn',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
    }

    dic_parms = {}
    try:
        # Read the stock code, short name and social credit code of a company.
        while True:
            start_time = time.time()
            # Fetch the next company to process.
            # social_code = baseCore.redicPullData('NoticeEnterprise:gnqy_socialCode')
            # NOTE(review): hard-coded company with the redis pull disabled;
            # together with the `break` below the loop runs exactly once.
            social_code = '91370000163446410B'
            # When redis has no more data, wait and try again.
            if social_code is None:
                time.sleep(20)
                continue
            dic_info = baseCore.getInfomation(social_code)

            count = dic_info[17]
            # Shanghai http://eid.csrc.gov.cn/101111/index.html  Shenzhen http://eid.csrc.gov.cn/101811/index.html  Beijing http://eid.csrc.gov.cn/102611/index.html
            # The url varies by page and column, e.g. http://eid.csrc.gov.cn/101811/index_3_f.html

            # Issuance/listing announcements; the Beijing exchange has no such column.
            url_parms = ['101110', '101810']
            Catagory2_parms = ['9603', '10057']
            # Interim reports.
            url_parms_ls = ['101112', '101812', '102612']
            Catagory2_parms_ls = ['9605', '10059', '10163']

            # Pick the links by stock code: codes starting with 0/2/3 are
            # Shenzhen, 6/9 are Shanghai, 4/8 are Beijing.
            code = dic_info[3]
            short_name = dic_info[4]
            if short_name == 'None':
                short_name = dic_info[1]
            com_name = dic_info[1]
            dic_parms = getUrl(code, url_parms, Catagory2_parms)
            dic_parms_ls = getUrl(code, url_parms_ls, Catagory2_parms_ls)

            # The two columns are handled independently: a company without an
            # issuance column (Beijing exchange) still gets its interim reports.
            if dic_parms:
                start_time_cj = time.time()
                log.info(f'======开始处理{com_name}=====发行公告=======')
                SpiderByZJH(dic_parms["url"], dic_parms["payload"], dic_info, start_time,num)
                log.info(f'{code}==========={short_name},{com_name},发行公告,耗时{baseCore.getTimeCost(start_time_cj, time.time())}')
            if dic_parms_ls:
                start_time_ls = time.time()
                log.info(f'======开始处理{com_name}=====临时报告=======')
                SpiderByZJH(dic_parms_ls['url'], dic_parms_ls['payload'], dic_info, start_time,num)
                log.info(f'{code}==========={short_name},{com_name},临时报告,耗时{baseCore.getTimeCost(start_time_ls, time.time())}')
            if dic_parms or dic_parms_ls:
                end_time = time.time()
                log.info(f'{short_name} ---- 该企业耗时 ---- {baseCore.getTimeCost(start_time, end_time)}-----------')
                count += 1
                runType = 'NoticeReportCount'
                baseCore.updateRun(social_code, runType, count)
            break
    finally:
        # Release the database resources even when processing raised.
        cursor.close()
        cnx.close()
        baseCore.close()
