import datetime
import json
import os
import re
import time
import uuid
from urllib.parse import unquote

import pymongo
import redis
from bs4 import BeautifulSoup
from fitz import fitz
from obs import ObsClient
from retry import retry

from base import BaseCore
import requests

# MongoDB collection used to store (and de-duplicate) collected announcements.
# NOTE(review): host, user name and password are hard-coded here; consider
# moving them to configuration / environment variables and rotating them.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='ZZsn@9988').RESCenter[
    'REITsFundAnncmnt']
# Huawei Cloud OBS client used for uploading the downloaded files.
# NOTE(review): the access key / secret key are committed in source — treat as
# leaked credentials and load them from a secret store instead.
obsClient = ObsClient(
    access_key_id='VEHN7D0TJ9316H8AHCAV',  # your Huawei Cloud access key (AK)
    secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # your Huawei Cloud secret key (SK)
    server='https://obs.cn-north-1.myhuaweicloud.com'  # endpoint address of your bucket
)
# Shared project helper; the handles below are simply re-exported from it.
# presumably cursor_/cnx_ and cursor/cnx are two separate DB cursor/connection
# pairs and r is a Redis client (it is used with spop below) — TODO confirm in BaseCore.
baseCore = BaseCore.BaseCore()
cursor_ = baseCore.cursor_
cnx_ = baseCore.cnx_
cursor = baseCore.cursor
cnx = baseCore.cnx
r = baseCore.r
log = baseCore.getLogger()
# Default HTTP headers for the hkexnews.hk search requests made by getID/getJson.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
    # 'cookie': 'OptanonAlertBoxClosed=2023-08-29T09:50:42.503Z; AMCV_DD0356406298B0640A495CB8%40AdobeOrg=179643557%7CMCIDTS%7C19599%7CMCMID%7C90834671594036426831047706481131374722%7CvVersion%7C5.5.0; sclang=zh-CN; OptanonConsent=isGpcEnabled=0&datestamp=Wed+Aug+30+2023+13%3A52%3A11+GMT%2B0800+(%E4%B8%AD%E5%9B%BD%E6%A0%87%E5%87%86%E6%97%B6%E9%97%B4)&version=202303.2.0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0004%3A1%2CC0002%3A1&geolocation=CN%3BHA&AwaitingReconsent=false;',
    'Content-Type': 'application/x-www-form-urlencoded'
}


class obsOperate():
    """Download a PDF, extract its text, upload it to OBS and record it in MySQL.

    The instance keeps the HTTP headers used for downloads plus the database
    cursor/connection and logger handed in by the caller. Uploads go through
    the module-level ``obsClient`` into bucket ``zzsn`` under the
    ``PolicyDocument/`` prefix.
    """

    def __init__(self, cursor_, cnx_, log):
        """Store the DB cursor, DB connection and logger used by the other methods."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62',
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        self.cursor_ = cursor_
        self.cnx_ = cnx_
        self.log = log

    def secrchATT(self, item_id, file_name, type_id, order_by):
        """Return the first ``clb_sys_attachment`` row (``(id,)``) matching the keys, or None."""
        sel_sql = '''select id from clb_sys_attachment where item_id = %s and name = %s and type_id=%s and order_by=%s '''
        self.cursor_.execute(sel_sql, (item_id, file_name, type_id, order_by))
        return self.cursor_.fetchone()

    # Insert into the attachment table and return the attachment id.
    def tableUpdate(self, retData, com_name, file_name, num, pub_time):
        """Insert one attachment row built from ``retData`` and return ``(id, full_path)``.

        Args:
            retData: dict produced by :meth:`uptoOBS`.
            com_name: unused; kept for interface compatibility.
            file_name: value stored in the ``name`` column.
            num: value stored in the ``order_by`` column.
            pub_time: value stored in the ``publish_time`` column.

        Returns:
            Tuple of the attachment id (looked up again after the insert) and
            ``retData['full_path']``.
        """
        item_id = retData['item_id']
        type_id = retData['type_id']
        path = retData['path']
        full_path = retData['full_path']
        order_by = num

        Upsql = '''insert into clb_sys_attachment(name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,object_key,bucket_name,publish_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        values = (
            file_name, type_id, item_id, retData['group_name'], path, full_path,
            retData['category'], retData['file_size'], order_by,
            retData['status'], retData['create_by'],
            # ``path`` is stored twice: once as path and once as object_key.
            retData['create_time'], path, 'zzsn', pub_time)

        self.cursor_.execute(Upsql, values)  # insert
        self.cnx_.commit()  # commit
        self.log.info("更新完成:{}".format(Upsql))
        selects = self.secrchATT(item_id, file_name, type_id, order_by)
        att_id = selects[0]
        return att_id, full_path

    def getuuid(self):
        """Return a new time-based UUID (``uuid.uuid1``)."""
        return uuid.uuid1()

    # Human-readable file size.
    def convert_size(self, size_bytes):
        """Format a byte count as ``'<value with 2 decimals> <unit>'`` (bytes … TB)."""
        units = ['bytes', 'KB', 'MB', 'GB', 'TB']
        i = 0
        while size_bytes >= 1024 and i < len(units) - 1:
            size_bytes /= 1024
            i += 1
        return f"{size_bytes:.2f} {units[i]}"

    @retry(tries=5, delay=10)
    def getRes(self, file_href):
        """GET ``file_href`` and return the response; retried up to 5 times.

        Raises:
            requests.HTTPError: when the final attempt does not answer 200.
        """
        # A timeout keeps one stalled connection from hanging the whole job.
        response = requests.get(file_href, headers=self.headers, timeout=60)
        if response.status_code != 200:
            # An explicit exception replaces the former bare ``raise``, which only
            # worked by accident (RuntimeError: no active exception).
            raise requests.HTTPError(
                f'unexpected status {response.status_code} for {file_href}',
                response=response)
        return response

    @retry(tries=5, delay=10)
    def sendOBS(self, file_name, response):
        """Upload ``response.content`` to OBS as ``PolicyDocument/<file_name>``.

        Raises:
            RuntimeError: when OBS reports a status >= 300, so that ``@retry``
                actually retries rejected uploads.
        """
        result = obsClient.putContent('zzsn', 'PolicyDocument/' + file_name, content=response.content)
        status = getattr(result, 'status', None)
        if status is not None and status >= 300:
            raise RuntimeError(f'OBS upload failed with status {status}')
        return result

    def uptoOBS(self, file_href, item_id, file_name):
        """Download a PDF, extract its text and upload it to OBS.

        Args:
            file_href: URL of the file; its extension becomes ``category``.
            item_id: stored unchanged in the result.
            file_name: unused; the stored object is named ``<uuid><extension>``.

        Returns:
            A dict describing the attachment. ``state`` is True only when the
            download, PDF parsing and upload all succeeded and the object URL
            could be read from the OBS result; otherwise the dict is returned
            with ``state`` False.
        """
        category = os.path.splitext(file_href)[1]
        retData = {'state': False, 'type_id': 15, 'item_id': item_id, 'group_name': '', 'path': '',
                   'full_path': '',
                   'category': category, 'file_size': '', 'status': 1, 'create_by': 'LiuLiYuan',
                   'create_time': '', 'page_size': '', 'content': ''}
        try:
            response = self.getRes(file_href)
        except Exception:
            self.log.error('文件获取失败')
            return retData

        try:
            with fitz.open(stream=response.content, filetype='pdf') as doc:
                retData['content'] = ''.join(page.get_text() for page in doc.pages())
        except Exception:
            self.log.error('文件解析失败')
            return retData

        # Content-Length is absent for chunked responses; fall back to the body size.
        content_length = response.headers.get('Content-Length')
        if content_length is not None and content_length.strip().isdigit():
            file_size = int(content_length)
        else:
            file_size = len(response.content)

        object_name = str(self.getuuid()) + category
        try:
            result = self.sendOBS(object_name, response)
        except Exception:
            self.log.error('obs上传失败')
            return retData

        try:
            object_url = result['body']['objectUrl']
            path = object_url.split('.com')[1]
            full_path = unquote(object_url)
        except Exception as e:
            self.log.error(f'error:{e}')
            return retData

        # Mark success only once every field is available, so callers never see
        # state=True together with empty paths.
        retData['path'] = path
        retData['full_path'] = full_path
        retData['file_size'] = self.convert_size(file_size)
        retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        retData['state'] = True
        return retData

@retry(tries=3, delay=30)
def getToken():
    """Pop one captured HKEX widget URL from Redis and extract its query values.

    The URL is removed (``spop``) from the Redis set ``cookie:HKEXurl``; every
    retry therefore consumes another entry.

    Returns:
        Tuple ``(token, callback, time_)`` where ``token`` has ``+`` and ``/``
        percent-encoded, ``callback`` is the JSONP callback name and ``time_``
        is the digits following ``_=``.

    Raises:
        ValueError: when the Redis set is empty.
        IndexError: when the URL lacks one of the expected parameters.
    """
    raw_url = r.spop('cookie:HKEXurl')
    if raw_url is None:
        # Fail with a clear message instead of AttributeError on None.decode.
        raise ValueError('redis set cookie:HKEXurl is empty')
    tokenUrl = raw_url.decode('utf-8')
    token = re.findall(r'token=(.*?)&', tokenUrl)[0].replace('+', '%2b').replace('/', '%2f')
    callback = re.findall(r'callback=(.*?)&', tokenUrl)[0]
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    time_ = re.findall(r'_=(\d+)', tokenUrl)[0]
    return token, callback, time_

def getID(code):
    """Look up the hkexnews stock id and announcement count for a stock code.

    Posts the title-search form for ``code`` covering 1999-04-01 up to today
    and scrapes the hidden ``stockId`` input and the record counter from the
    returned HTML.

    Args:
        code: stock code placed in the ``txt_stock_code`` form field.

    Returns:
        Tuple ``(stock_id, total)``: the ``stockId`` form value (string) and the
        number of records reported by the page (int).

    Raises:
        AttributeError / IndexError: when the page does not contain the
            expected form, counter element or counter text.
    """
    url = 'https://www1.hkexnews.hk/search/titlesearch.xhtml?lang=zh'
    # Use today's date as the upper bound so the count matches what getJson
    # queries (it also searches up to today) instead of a frozen 2023 date.
    today = datetime.datetime.today()
    data_post = {
        'current_page': '1',
        'stock_market': 'HKEX',
        'rdo_SelectSortBy': 'DateTime',
        'txt_stock_code': f'{code}',
        'rdo_SelectDocType': '',
        'sel_DocTypePrior2006': '-1',
        'sel_DocTypeAfter2006': '',
        'sel_tier_1': '-2',
        'sel_tier_2': '-2',
        'sel_tier_2_group': '-2',
        'IsFromNewList': False,
        'txtKeyWord': '',
        'sel_DateOfReleaseFrom_d': '01',
        'sel_DateOfReleaseFrom_m': '04',
        'sel_DateOfReleaseFrom_y': '1999',
        'sel_DateOfReleaseTo_d': today.strftime('%d'),
        'sel_DateOfReleaseTo_m': today.strftime('%m'),
        'sel_DateOfReleaseTo_y': today.strftime('%Y'),
    }
    req = requests.post(url, headers=headers, data=data_post)
    try:
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'html.parser')
    finally:
        # Release the connection even when decoding/parsing fails.
        req.close()
    stock_id = soup.find('form', attrs={'id': 'j_idt5'}).find('input', attrs={'id': 'stockId'}).get('value')
    counter_text = soup.find('div', class_='component-loadmore-leftPart__container').text
    total = int(re.findall(r'共有\s+(\d+)\s+紀錄', counter_text)[0])
    return stock_id, total

def getCodeList():
    """Fetch the REIT list from the HKEX widget endpoint.

    Uses the token, callback name and timestamp from :func:`getToken` to call
    the ``getreitfilter`` JSONP endpoint and unwraps the JSON payload.

    Returns:
        List of ``[code, name, ric]`` entries, where ``code`` is ``sym``
        left-padded with zeros to 5 characters, ``name`` is ``nm`` passed
        through ``baseCore.hant_2_hans`` and ``ric`` is taken as-is.

    Raises:
        IndexError: when the response is not a ``callback(...)`` wrapper.
        KeyError / json.JSONDecodeError: when the payload has an unexpected shape.
    """
    token, callback, time_ = getToken()
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
        'Accept': '*/*',
        'Referer': 'https://www.hkex.com.hk/'
    }
    url = f'https://www1.hkex.com.hk/hkexwidget/data/getreitfilter?lang=chi&token={token}&sort=5&order=0&qid=1701506928884&callback={callback}&_={time_}'
    req = requests.get(url, headers=request_headers)
    try:
        req.encoding = req.apparent_encoding
        body = req.text
    finally:
        req.close()
    # Greedy match up to the LAST ')' so a ')' inside the JSON (e.g. in a fund
    # name) does not truncate the payload; DOTALL tolerates multi-line bodies.
    payload = re.findall(r'\((.*)\)', body, re.S)[0]
    stocklist = json.loads(payload)['data']['stocklist']
    return [
        [entry['sym'].rjust(5, '0'), baseCore.hant_2_hans(entry['nm']), entry['ric']]
        for entry in stocklist
    ]

def getJson(id, page):
    """Query the hkexnews title-search servlet and return the decoded rows.

    The search covers 1999-04-01 up to today's date for the given stock id;
    ``page`` is sent as the ``rowRange`` query parameter. The servlet's
    ``result`` field is itself a JSON string, so it is decoded a second time.
    """
    today = datetime.datetime.today().strftime('%Y%m%d')
    url = f'https://www1.hkexnews.hk/search/titleSearchServlet.do?sortDir=0&sortByOptions=DateTime&category=0&market=SEHK&stockId={id}&documentType=-1&fromDate=19990401&toDate={today}&title=&searchType=0&t1code=-2&t2Gcode=-2&t2code=-2&rowRange={page}&lang=zh'
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding
    envelope = response.json()
    rows = json.loads(envelope['result'])
    response.close()
    return rows

def doJob(obsOperate):
    """Collect announcements for every REIT and persist them.

    For each entry returned by :func:`getCodeList`, the announcement rows are
    fetched in steps of 100 via :func:`getJson`. Rows already present in
    MongoDB (same code, date, href and exchange) are skipped; the others are
    uploaded through ``obsOperate.uptoOBS``, recorded with
    ``obsOperate.tableUpdate`` and inserted into ``db_storage``.

    Args:
        obsOperate: object providing ``uptoOBS`` and ``tableUpdate``.
    """
    page_size = 100
    for code, name, ric in getCodeList():
        stock_id, total = getID(code)
        num = 1
        log.info(f'开始采集==={name}==={code}===共{total}条数据')
        # Stop bound is total + page_size so the final partial page is requested
        # too (e.g. total=250 -> 100, 200, 300; total=50 -> 100). The previous
        # bound of total + 1 skipped it, and fetched nothing when total < 100.
        for page in range(page_size, total + page_size, page_size):
            data_json = getJson(stock_id, page)
            for data_ in data_json:
                title = baseCore.hant_2_hans(data_['TITLE'])
                date = datetime.datetime.strptime(data_['DATE_TIME'], '%d/%m/%Y %H:%M')
                href = 'https://www1.hkexnews.hk' + data_['FILE_LINK']
                file_title = title + '.pdf'
                is_insert = db_storage.find_one({'code': code, 'date': date, 'href': href, 'exchange': '香港交易所'})
                if is_insert:
                    log.info(f'{code}==={title}===已采集')
                    num += 1
                    continue
                retData = obsOperate.uptoOBS(href, ric, file_title)
                time.sleep(2)  # throttle requests to the source site
                if not retData['state']:
                    log.error(f'{code}==={title}===公告下载obs失败')
                    continue
                str_date = str(date)[:10]
                att_id, full_path = obsOperate.tableUpdate(retData, 'RETIs文件', file_title, num, str_date)
                num += 1
                createDate = datetime.datetime.now().strftime('%Y-%m-%d')
                dic_info = {
                    'code': code,  # stock code
                    'name': name,  # fund name
                    'title': title,  # announcement title
                    'path': full_path,  # file location in OBS
                    'href': href,  # original link
                    'content': retData['content'],  # text extracted from the PDF
                    'date': date,  # publication time (datetime)
                    'strDate': str_date,  # publication date (string)
                    'exchange': '香港交易所',  # exchange
                    'createDate': createDate  # record creation date
                }
                db_storage.insert_one(dic_info)
                log.info(f'{code}==={title}===采集成功')


if __name__ == '__main__':
    # Use a distinct name so the obsOperate class is not rebound to an instance.
    operator = obsOperate(cursor_, cnx_, log)
    try:
        doJob(operator)
    finally:
        # Always release the BaseCore resources, even when the job fails midway.
        baseCore.close()
