import datetime
import re
import time

import numpy as np
import pandas as pd
import pymongo
import requests
import os
import json
import uuid
from urllib.parse import unquote

from fitz import fitz
from kafka import KafkaProducer
from obs import ObsClient
from retry import retry

from base import BaseCore

# NOTE(review): the MongoDB password and the OBS access/secret keys below are
# hard-coded in source. They should be rotated and loaded from configuration
# or environment variables instead of being committed.

# MongoDB collection 'REITsFundAnncmnt' in database 'RESCenter'; doJob() uses it
# to de-duplicate announcements and to store the collected documents.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
    'REITsFundAnncmnt']
# Huawei Cloud OBS client used by obsOperate.sendOBS() to upload file contents.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # OBS endpoint address of the bucket
)
# Project helper object; it exposes two cursor/connection pairs and a logger.
# NOTE(review): which database each pair targets is not visible in this file.
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
log = baseCore.getLogger()
# Request headers sent with the announcement-list API calls
# (getPageSize / getDataList).
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    'Host': 'www.szse.cn',
    'Origin': 'http://www.szse.cn',
    'Pragma': 'no-cache',
    'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    'X-Request-Type': 'ajax',
    'X-Requested-With': 'XMLHttpRequest',
}
# Announcement-list endpoint that getPageSize / getDataList POST to.
url = 'http://www.szse.cn/api/disc/announcement/annList'


class obsOperate():
    """Download announcement files, upload them to OBS and register attachments.

    Instances hold the HTTP headers used for file downloads, a DB cursor and
    connection used for the ``clb_sys_attachment`` table, and a logger.
    """

    def __init__(self, cursor_, cnx_, log):
        """Store the DB cursor, DB connection and logger used by the methods."""
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'Host': 'www.szse.cn',
            'Origin': 'http://www.szse.cn',
            'Pragma': 'no-cache',
            'Referer': 'http://www.szse.cn/disclosure/fund/notice/index.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
            'X-Request-Type': 'ajax',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.cursor_ = cursor_
        self.cnx_ = cnx_
        self.log = log

    def secrchATT(self, item_id, file_name, type_id, order_by):
        """Return the first ``(id,)`` row matching the attachment, or ``None``."""
        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
        self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
        return self.cursor_.fetchone()

    def tableUpdate(self, retData, com_name, file_name, num, pub_time):
        """Insert one row into ``clb_sys_attachment`` and return its id and URL.

        Args:
            retData: dict produced by :meth:`uptoOBS`.
            com_name: unused; kept so existing callers keep working.
            file_name: value stored in the ``name`` column.
            num: value stored in the ``order_by`` column.
            pub_time: value stored in the ``publish_time`` column.

        Returns:
            ``(attachment_id, full_path)``.

        Raises:
            KeyError: if ``retData`` lacks one of the required keys.
            RuntimeError: if the inserted row cannot be read back.
        """
        item_id = retData['item_id']
        type_id = retData['type_id']
        path = retData['path']
        full_path = retData['full_path']
        order_by = num

        Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        # ``path`` is stored twice: once as path and once as object_key.
        values = (
            file_name, type_id, item_id, retData['group_name'], path, full_path,
            retData['category'], retData['file_size'], order_by,
            retData['status'], retData['create_by'],
            retData['create_time'], path, 'zzsn', pub_time)

        self.cursor_.execute(Upsql, values)
        self.cnx_.commit()
        self.log.info("更新完成:{}".format(Upsql))

        # NOTE(review): this lookup returns the first row matching
        # (item_id, name, type_id, order_by); if older rows share those values
        # the id returned may not be the row inserted above.
        selects = self.secrchATT(item_id, file_name, type_id, order_by)
        if selects is None:
            raise RuntimeError(
                f'attachment row for {file_name!r} not found after insert')
        return selects[0], full_path

    def getuuid(self):
        """Return a new time-based UUID (``uuid.uuid1``)."""
        return uuid.uuid1()

    def convert_size(self, size_bytes):
        """Format a byte count as ``'<value with 2 decimals> <unit>'``.

        The value is divided by 1024 until it drops below 1024 or the largest
        unit (TB) is reached.
        """
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
        i = 0
        while size_bytes >= 1024 and i < len(units) - 1:
            size_bytes /= 1024
            i += 1
        return f"{size_bytes:.2f} {units[i]}"

    @staticmethod
    def _content_length(response):
        """Return the body size in bytes for *response*.

        Uses the ``Content-Length`` header when it is present and numeric and
        falls back to the length of the downloaded body otherwise (the header
        is absent for chunked responses).
        """
        try:
            return int(response.headers.get('Content-Length'))
        except (TypeError, ValueError):
            return len(response.content)

    @retry(tries=5, delay=10)
    def getRes(self, file_href):
        """GET *file_href* and return the response; retried up to 5 times.

        Raises:
            requests.HTTPError: when the status code is not 200.
            requests.RequestException: on connection errors or timeouts.
        """
        # Without a timeout a stalled connection would block the crawler forever.
        response = requests.get(file_href, headers=self.headers, timeout=(10, 60))
        if response.status_code != 200:
            # Explicit exception instead of a bare ``raise`` with no active
            # exception; it still triggers the retry decorator.
            raise requests.HTTPError(
                f'unexpected status {response.status_code} for {file_href}',
                response=response)
        return response

    @retry(tries=5, delay=10)
    def sendOBS(self, file_name, response):
        """Upload ``response.content`` to bucket ``zzsn`` under ``PolicyDocument/``.

        Returns the SDK result object. Raises ``RuntimeError`` when the SDK
        reports a status of 300 or above so the retry decorator can try again.
        """
        result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
        # Per the OBS SDK docs a failed request is reported via ``status``
        # rather than an exception; a missing status keeps the old behaviour.
        status = result.status
        if status is not None and status >= 300:
            raise RuntimeError(f'OBS upload failed with status {status}')
        return result

    def uptoOBS(self, file_href, item_id, file_name):
        """Download a PDF, extract its text and upload the file to OBS.

        Args:
            file_href: URL of the file to download.
            item_id: value copied into the returned ``item_id`` field.
            file_name: accepted for interface compatibility; the stored object
                name is always a fresh UUID plus the extension of *file_href*.

        Returns:
            A dict describing the attachment. ``state`` is ``True`` only when
            download, PDF parsing, upload and result parsing all succeeded;
            ``content`` holds the extracted text, ``path`` / ``full_path`` the
            OBS location.
        """
        category = os.path.splitext(file_href)[1]
        retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
                   'full_path': '',
                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
                   'create_time': '', 'page_size': '', 'content': ''}
        try:
            response = self.getRes(file_href)
        except Exception as e:
            self.log.error(f'文件获取失败: {e}')
            return retData

        try:
            with fitz.open(stream=response.content, filetype='pdf') as doc:
                retData['content'] = ''.join(page.get_text() for page in doc.pages())
        except Exception as e:
            self.log.error(f'文件解析失败: {e}')
            return retData

        file_size = self._content_length(response)
        file_name = str(self.getuuid()) + category
        try:
            result = self.sendOBS(file_name, response)
        except Exception as e:
            self.log.error(f'obs上传失败: {e}')
            return retData

        try:
            object_url = result['body']['objectUrl']
            path = object_url.split('.com')[1]
            full_path = unquote(object_url)
            size_text = self.convert_size(file_size)
        except Exception as e:
            self.log.error(f'error:{e}')
            return retData

        # Only flag success once every derived field has been computed, so a
        # malformed upload result can never be reported as state=True.
        retData['path'] = path
        retData['full_path'] = full_path
        retData['file_size'] = size_text
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['state'] = True
        return retData


def getCodeList():
    """Return the list of fund codes published by the SZSE report endpoint.

    Each row's ``sys_key`` field is expected to contain the code wrapped in a
    ``<u>...</u>`` tag; rows without such a tag are logged and skipped instead
    of aborting the whole run.

    Raises:
        requests.RequestException: on connection errors or timeouts.
        ValueError / KeyError / IndexError: if the response is not the
            expected JSON structure.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Host': 'www.szse.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.szse.cn/market/product/list/all/index.html',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'X-Request-Type': 'ajax',
        'X-Requested-With': 'XMLHttpRequest',
    }
    url = 'http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=1105&TABKEY=tab1&selectJjlb=%E5%9F%BA%E7%A1%80%E8%AE%BE%E6%96%BD%E5%9F%BA%E9%87%91'
    # NOTE(review): a proxy is requested but never passed to requests. The call
    # is kept in case get_proxy() has side effects; confirm whether it should
    # be supplied as ``proxies=`` or removed.
    baseCore.get_proxy()
    # A timeout keeps a stalled connection from hanging the crawler forever.
    req = requests.get(url, headers=headers, timeout=30)
    try:
        req.encoding = req.apparent_encoding
        data_list = req.json()[0]['data']
    finally:
        # Close the response even when decoding fails.
        req.close()

    code_list = []
    for data_ in data_list:
        matches = re.findall('<u>(.*?)</u>', data_['sys_key'])
        if not matches:
            log.error(f"sys_key without <u> tag skipped: {data_['sys_key']}")
            continue
        code_list.append(matches[0])
    return code_list


def getPageSize(id):
    """Return the number of 50-item announcement pages available for fund *id*.

    Posts a first-page query to the announcement-list endpoint and derives the
    page count from the ``announceCount`` field of the JSON response.

    Raises:
        requests.RequestException: on connection errors or timeouts.
        ValueError / KeyError: if the response lacks a numeric ``announceCount``.
    """
    page_len = 50
    data_post = json.dumps(
        {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"],
         "pageSize": page_len, "pageNum": 1})
    # NOTE(review): a proxy is requested but never passed to requests. The call
    # is kept in case get_proxy() has side effects; confirm the intent.
    baseCore.get_proxy()
    # A timeout keeps a stalled connection from hanging the crawler forever.
    req = requests.post(url, headers=headers, data=data_post, timeout=30)
    try:
        req.encoding = req.apparent_encoding
        total = int(req.json()['announceCount'])
    finally:
        # Close the response even when decoding fails.
        req.close()
    # Integer ceiling division: a partially filled last page still counts.
    return (total + page_len - 1) // page_len


def getDataList(id, page):
    """Return the ``data`` list of announcement records for fund *id*, page *page*.

    Posts a 50-items-per-page query to the announcement-list endpoint.

    Raises:
        requests.RequestException: on connection errors or timeouts.
        ValueError / KeyError: if the response is not JSON with a ``data`` key.
    """
    data_post = json.dumps(
        {"seDate": ["", ""], "stock": [f"{id}"], "channelCode": ["fundinfoNotice_disc"],
         "pageSize": 50, "pageNum": page})
    # NOTE(review): a proxy is requested but never passed to requests. The call
    # is kept in case get_proxy() has side effects; confirm the intent.
    baseCore.get_proxy()
    # A timeout keeps a stalled connection from hanging the crawler forever.
    req = requests.post(url, headers=headers, data=data_post, timeout=30)
    try:
        req.encoding = req.apparent_encoding
        return req.json()['data']
    finally:
        # Close the response even when decoding fails.
        req.close()


def doJob(obsOperate):
    """Collect announcements for every fund code and persist new ones.

    For each code and each result page, every announcement not yet present in
    ``db_storage`` is downloaded and uploaded through ``obsOperate.uptoOBS``,
    registered with ``obsOperate.tableUpdate`` and finally inserted into
    ``db_storage``.

    Args:
        obsOperate: object providing ``uptoOBS`` and ``tableUpdate``.
    """
    for code in getCodeList():
        try:
            pageSize = getPageSize(code)
        except Exception as e:
            # One unreachable code should not abort the remaining codes.
            log.error(f'{code}===总页数获取失败: {e}')
            time.sleep(5)
            continue
        log.info(f'{code}===共{pageSize}页')
        for page in range(1, pageSize + 1):
            log.info(f'开始采集第{page}页')
            try:
                data_list = getDataList(code, page)
            except Exception as e:
                log.error(f'第{page}页数据获取失败: {e}')
                time.sleep(5)
                continue
            # Attachment order restarts at 1 on every page.
            num = 1
            # ``or []`` guards against a null ``data`` field in the response.
            for data in data_list or []:
                title = data['title']
                name = data['secName'][0]
                file_title = title + '.pdf'
                pub_time = data['publishTime']
                href = 'http://www.szse.cn/api/disc/info/download?id=' + data['id']
                if db_storage.find_one({'code': code, 'href': href, 'exchange': '深圳证券交易所'}):
                    log.info(f'{title}===已采集')
                    time.sleep(2)
                    # NOTE(review): this only leaves the current page; later
                    # pages of the same code are still requested. Confirm
                    # whether paging should stop here as well.
                    break
                retData = obsOperate.uptoOBS(href, '', file_title)
                time.sleep(2)
                if not retData['state']:
                    log.error(f'{code}==={title}===公告下载obs失败')
                    continue
                _, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, pub_time)
                num += 1
                dic_news = {
                    'code': code,  # fund code
                    'name': name,  # short name
                    'title': title,  # announcement title
                    'path': full_path,  # OBS URL of the uploaded file
                    'href': href,  # original download link
                    'content': retData['content'],  # text extracted from the PDF
                    'date': datetime.datetime.strptime(pub_time, '%Y-%m-%d %H:%M:%S'),  # publish time
                    'strDate': pub_time,  # publish time as string
                    'exchange': '深圳证券交易所'  # exchange
                }
                try:
                    db_storage.insert_one(dic_news)
                    log.info(f'{title}===采集成功')
                except Exception as e:
                    log.error(f'{title}===入库失败: {e}')
                time.sleep(2)


if __name__ == '__main__':
    # Bind the instance to its own name so the ``obsOperate`` class is not
    # shadowed by an instance of itself.
    operator = obsOperate(cursor_, cnx_, log)
    try:
        doJob(operator)
    finally:
        # Always call baseCore.close(), even when the job aborts with an error.
        baseCore.close()
