import os
import uuid
import fitz
import requests
from bs4 import BeautifulSoup
import time, json
from kafka import KafkaProducer
from obs import ObsClient

from urllib.parse import unquote

from retry import retry
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from base import BaseCore
# Shared project helper; every function in this file uses it for logging,
# database handles and task bookkeeping.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Connection/cursor pair used below for the brpa_source_article table.
cnx = baseCore.cnx
cursor = baseCore.cursor

# Key prefix prepended to every object name uploaded to OBS.
pathType = 'QYNotice/'


# Second connection/cursor pair, used below for the clb_sys_attachment table.
cnx_ = baseCore.cnx_
cursor_ = baseCore.cursor_

# NOTE(review): the OBS credentials below are hard-coded in source control;
# they should be rotated and loaded from configuration or the environment.
obsClient = ObsClient(
        access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key (AK)
        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key (SK)
        server='https://obs.cn-north-1.myhuaweicloud.com'  # OBS endpoint address
    )

# Format a byte count as a human-readable size string
def convert_size(size_bytes):
    """Return ``size_bytes`` scaled to the largest fitting unit, e.g. ``"1.50 KB"``.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, then rendered with two decimal places.
    """
    unit_names = ('bytes', 'KB', 'MB', 'GB', 'TB')
    last_index = len(unit_names) - 1
    value = size_bytes
    index = 0
    while value >= 1024 and index < last_index:
        value = value / 1024
        index += 1
    return "{:.2f} {}".format(value, unit_names[index])

def getuuid():
    """Return a freshly generated version-1 (timestamp-based) ``uuid.UUID``."""
    return uuid.uuid1()

def uptoOBS(pdf_url,pdf_name,type_id,social_code):
    """Download the file at ``pdf_url``, upload it to OBS and describe the result.

    The download is attempted up to three times. The body is uploaded through
    ``getOBSres`` under ``pathType`` with a UUID-based object name, then opened
    with PyMuPDF (``fitz``) to count its pages.

    Args:
        pdf_url: URL of the file; its extension becomes ``retData['category']``.
        pdf_name: Not used by this function; kept so existing callers still work.
        type_id: Copied into ``retData['type_id']``.
        social_code: Copied into ``retData['item_id']`` and used for task logging.

    Returns:
        dict: the attachment description. ``state`` is True only when the
        download, the upload and the page count all succeeded and every
        derived field (``path``, ``full_path``, ``file_size``, ``create_time``,
        ``page_size``) has been filled in; otherwise ``state`` stays False.

    Note:
        The failure branch at the end reads the module-level ``start_time`` and
        ``taskType`` names, which this file assigns in its ``__main__`` section.
    """
    category = os.path.splitext(pdf_url)[1]
    retData = {'state': False, 'type_id': type_id, 'item_id': social_code, 'group_name': '', 'path': '',
               'full_path': '',
               'category': category, 'file_size': '', 'status': 1, 'create_by': 'XueLingKun',
               'create_time': '', 'page_size': '', 'content': ''}
    headers = {'User-Agent': baseCore.getRandomUserAgent()}

    # Retry the download; keep ``response`` as None so a total failure is
    # detected explicitly instead of surfacing later as an unbound name.
    response = None
    for _ in range(3):
        try:
            response = requests.get(pdf_url, headers=headers, verify=False, timeout=20)
            break
        except requests.RequestException as e:
            log.error(f'文件下载失败----{pdf_url}----{e}')
            time.sleep(3)
    if response is None:
        return retData

    # Content-Length is optional (e.g. chunked responses); fall back to the
    # size of the body actually received rather than failing the download.
    try:
        file_size = int(response.headers['Content-Length'])
    except (KeyError, TypeError, ValueError):
        file_size = len(response.content)
    retData['content'] = response.text

    name = str(getuuid()) + category
    try:
        result = getOBSres(pathType, name, response)
    except Exception:
        log.error(f'OBS发送失败')
        return retData

    try:
        with fitz.open(stream=response.content, filetype='pdf') as doc:
            page_size = doc.page_count
    except Exception:
        log.error(f'文件损坏')
        return retData
    if page_size < 1:
        # The document could not be parsed into any page.
        return retData

    # Compute every derived value first so a failure here cannot leave
    # ``retData`` marked successful while only partially populated.
    try:
        object_url = result['body']['objectUrl']
        path = unquote(object_url.split('.com')[1])
        full_path = unquote(object_url)
        size_text = convert_size(file_size)
    except Exception as e:
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, f'{e}')
        return retData

    retData['path'] = path
    retData['full_path'] = full_path
    retData['file_size'] = size_text
    retData['create_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    retData['page_size'] = page_size
    retData['state'] = True
    return retData

@retry(tries=3, delay=1)
def getOBSres(pathType,name, response):
    """Upload ``response.content`` to the ``zzsn`` bucket as ``pathType + name``.

    Wrapped in ``retry(tries=3, delay=1)``. Returns whatever
    ``obsClient.putContent`` returns.
    """
    object_key = pathType + name
    return obsClient.putContent('zzsn', object_key, content=response.content)

def secrchATT(item_id, retData, type_id):
    """Fetch the first ``clb_sys_attachment`` row matching the given attachment.

    Rows are matched on ``item_id``, ``retData['path']`` and ``type_id``; the
    return value is whatever ``cursor_.fetchone()`` yields for that query.
    """
    query = '''select id from clb_sys_attachment where item_id = %s and path = %s and type_id=%s '''
    params = (item_id, retData['path'], type_id)
    cursor_.execute(query, params)
    return cursor_.fetchone()

# Insert a row into the attachment table and return the attachment id
def tableUpdate(retData, com_name, year, pdf_name, num):
    """Record an uploaded attachment in ``clb_sys_attachment`` and return its id.

    Args:
        retData: Attachment description as produced by ``uptoOBS``.
        com_name: Not used by this function; kept for caller compatibility.
        year: Value for the ``year`` column.
        pdf_name: Attachment title; the stored name is ``pdf_name + category``.
        num: Value for the ``order_by`` column.

    Returns:
        The ``id`` of the row matching this attachment, or ``''`` when the
        lookup fails or finds no row (for example because the insert failed).
    """
    item_id = retData['item_id']
    type_id = retData['type_id']
    group_name = retData['group_name']
    path = retData['path']
    full_path = retData['full_path']
    category = retData['category']
    file_size = retData['file_size']
    status = retData['status']
    create_by = retData['create_by']
    page_size = retData['page_size']
    create_time = retData['create_time']
    order_by = num
    file_name = pdf_name + category
    try:
        Upsql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,page_size,object_key,bucket_name) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''

        # The object key is the part of the full URL after the bucket host.
        values = (
            year, file_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by,
            status, create_by,
            create_time, page_size,full_path.split('https://zzsn.obs.cn-north-1.myhuaweicloud.com/')[1],'zzsn')
        cursor_.execute(Upsql, values)  # insert
        cnx_.commit()  # commit
    except Exception as e:
        log.info(e)
    else:
        # Only report completion when the insert really went through.
        log.info(f"更新完成:{item_id}===={file_name}")
    try:
        selects = secrchATT(item_id, retData, type_id)
    except Exception as e:
        log.info(e)
        return ''
    if not selects:
        # No row found: report "no id" instead of crashing on selects[0].
        log.info(f'附件入库后未查询到记录:{item_id}===={file_name}')
        return ''
    return selects[0]


def InsterInto(social_code, pdf_url,pub_time,pdf_name):
    """Insert one collected announcement into ``brpa_source_article``.

    Args:
        social_code: Value for ``social_credit_code``.
        pdf_url: Value for ``source_address``.
        pub_time: Value for ``publish_time``.
        pdf_name: Value for ``title``.

    Returns:
        bool: True when the row was inserted and committed, False otherwise.
        On failure the error is logged and recorded through
        ``baseCore.recordLog`` (which reads the module-level ``start_time`` and
        ``taskType`` names).
    """
    insert_sql = '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,create_time) values(%s,%s,%s,%s,%s,%s,now())'''
    params = (
        social_code,
        pdf_url,
        '东方财富网',
        '1',
        pub_time,
        pdf_name,
    )
    try:
        cursor.execute(insert_sql, params)
        cnx.commit()
    except Exception as e:
        # Keep the real cause visible instead of silently discarding it.
        log.error(f'数据库传输失败----{pdf_url}----{e}')
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, pdf_url, '数据库传输失败')
        return False
    return True


def ifInstert(short_name, social_code, pdf_url):
    """Tell whether an announcement still needs to be collected.

    Looks for a ``brpa_source_article`` row with the same ``social_code`` and
    ``pdf_url`` for this origin and type. Returns False (and logs the fact)
    when such a row exists, True when it does not.
    """
    query = '''select social_credit_code,source_address from brpa_source_article where social_credit_code = %s and source_address = %s and origin='东方财富网' and type='1' '''
    cursor.execute(query, (social_code, pdf_url))
    existing = cursor.fetchone()
    if not existing:
        return True
    # Already stored: the caller should skip this announcement.
    log.info(f'com_name:{short_name}、{pdf_url}已存在')
    return False

def sendKafka(social_code,newsUrl,dic_news):
    """Publish ``dic_news`` as UTF-8 JSON to the ``researchReportTopic`` topic.

    Args:
        social_code: Company identifier, used only for failure bookkeeping.
        newsUrl: Source URL of the announcement, used only for failure bookkeeping.
        dic_news: JSON-serialisable payload to send.

    Returns:
        bool: True when the broker acknowledged the message within 10 seconds,
        False on any failure (the failure is logged and recorded through
        ``baseCore.recordLog``, which reads the module-level ``start_time`` and
        ``taskType`` names).
    """
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], max_request_size=1024*1024*20)
        kafka_result = producer.send("researchReportTopic",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        # Block until the broker acknowledges the record (or the wait times out).
        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, newsUrl, 'Kafka操作失败')
        log.info(dic_result)
        return False
    finally:
        # A producer is created per call, so release it every time; a failure
        # while closing must not change the outcome already decided above.
        if producer is not None:
            try:
                producer.close(timeout=10)
            except Exception as close_error:
                log.info(close_error)


def spider(browser, code, social_code, com_name):
    """Collect every announcement on the current list page, then move to the next page.

    For each list entry that ``ifInstert`` reports as not yet stored, the
    detail page is opened in a separate browser, its text is extracted, an
    attachment (if any) is uploaded through ``uptoOBS`` / ``tableUpdate``, and
    the result is sent to Kafka and recorded with ``InsterInto``.

    Args:
        browser: WebDriver already showing the company's announcement list.
        code: Stock code, used for logging.
        social_code: Company identifier stored with each announcement.
        com_name: Company name, used for logging and duplicate checks.

    Returns:
        False when an attachment could not be uploaded or registered, None
        once the last page has been handled, otherwise the result of the
        recursive call for the following page.
    """
    num = 0
    page_source = browser.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    now_page = int(soup.find_all('div', class_='mbox')[-1].find('span', class_='active').text)
    li_list = soup.find('div', class_='notice').find_all('li')
    log.info(f'----{com_name}--{code}--第{now_page}页开始处理-----')
    for li in li_list:
        publishDate = li.find('span').text
        year = publishDate[:4]
        newsUrl = 'https://np-info.eastmoney.com/pc/notice/?art_code=' + li.find('a')['data-code']
        title = li.find('a').text
        if not ifInstert(com_name, social_code, newsUrl):
            continue

        time.sleep(1)
        browser2 = createDriver()
        # try/finally guarantees the per-article browser is closed on every
        # exit path (short content, early return, or an exception).
        try:
            browser2.get(newsUrl)
            wait = WebDriverWait(browser2, 30)
            wait.until(EC.presence_of_element_located((By.ID, "render-html")))
            soup_news = BeautifulSoup(browser2.page_source, 'html.parser')
            contentWithTag = soup_news.find('div', id='render-html')
            content = contentWithTag.text
            if len(content) < 10:
                continue
            # Check whether the article has an attachment
            try:
                browser2.find_element(By.CLASS_NAME, 'download-list').click()
                time.sleep(0.5)
                browser2.switch_to.window(browser2.window_handles[-1])

                pdf_url = browser2.current_url
                # Upload to OBS
                retData = uptoOBS(pdf_url, title, 8, social_code)
                if not retData['state']:
                    log.info(f'====pdf解析失败====')
                    return False
                num = num + 1
                # Register the attachment in the attachment table
                att_id = tableUpdate(retData, com_name, year, title, num)
                if not att_id:
                    return False
            except Exception:
                # Any failure in the attachment step (including a missing
                # download link) means the article is sent without one.
                att_id = ''
        finally:
            browser2.quit()

        lang = baseCore.detect_language(content)
        if lang == 'cn':
            lang = 'zh'
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_news = {
            'attachmentIds': att_id,
            'author': '',
            'content': content,
            'contentWithTag': str(contentWithTag),
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': lang,
            'origin': '东方财富网',
            'publishDate': publishDate,
            'sid': '1684032033495392257',
            'sourceAddress': newsUrl,  # link to the original article
            'summary': '',
            'title': title,
            'type': 3,
            'socialCreditCode': social_code,
            'year': year
        }
        if sendKafka(social_code,newsUrl,dic_news):
            log.info(f'---{com_name}---{code}---第{now_page}页----采集成功---{newsUrl}')
            insert = InsterInto(social_code, newsUrl, publishDate, title)
            if insert:
                log.info('====插入数据库成功====')
        else:
            log.info(f'失败---{title}----{att_id}---{social_code}')
            # Sending failed: remove the attachment row that was just inserted,
            # but only when one actually exists.
            if att_id:
                baseCore.deliteATT(att_id)
                log.info(f'已删除插入附件表的数据---{title}-----{social_code}')
        time.sleep(1)
    # Pagination
    try:
        browser.find_element(By.CLASS_NAME, 'next').click()
        # The recursive call stays inside the try, as before, so an error raised
        # while handling a later page also falls through to the fallback below.
        return spider(browser, code, social_code, com_name)
    except Exception:
        span_tag = browser.find_element(By.XPATH, '//div[@class="mbox"]/span[2]')
        current_page = int(span_tag.text)
        totalpage = int(soup.find_all('div', class_='mbox')[-1].find_all('a')[-1].text)
        if current_page < totalpage:
            # Not on the last page yet: click the link right after the current page marker
            span_tag.find_element(By.XPATH, './following-sibling::a[1]').click()

            return spider(browser, code, social_code, com_name)
        else:
            # Already on the last page
            return

def createDriver():
    """Start and return a Chrome WebDriver.

    The chromedriver executable and the Chrome binary are taken from the
    fixed local paths below.
    """
    chrome_driver = r'D:\cmd100\chromedriver.exe'
    path = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    # Proxy setup (currently disabled)
    # proxy = "127.0.0.1:8080"  # proxy address and port
    # chrome_options.add_argument('--proxy-server=http://' + proxy)
    # ``options=`` is the supported keyword alongside ``service=``; the older
    # ``chrome_options=`` spelling is deprecated and rejected by newer Selenium.
    driver = webdriver.Chrome(service=path, options=chrome_options)
    return driver

def gonggao_info(dic_info):
    """Collect the announcements of one Hong Kong listed company.

    Args:
        dic_info: Company record; index 1 is read as the company name, index 2
            as the social credit code and index 3 as the stock code
            (for example ``'00175.HK'``).

    Returns:
        None. Companies whose stock code does not contain ``'HK'`` are skipped.
        Errors raised while crawling are logged, not propagated.
    """
    code = dic_info[3]
    com_name = dic_info[1]
    social_code = dic_info[2]
    if 'HK' not in code:
        return

    # Drive a real browser against the company's announcement page
    url = f'https://emweb.securities.eastmoney.com/PC_HKF10/pages/home/index.html?code={code.split(".HK")[0]}&type=web&color=w#/CompanyNews'
    browser = createDriver()
    # Always close the browser, otherwise every processed company leaves a
    # Chrome process behind.
    try:
        browser.get(url)
        time.sleep(1)
        try:
            spider(browser, code, social_code, com_name)
        except Exception as e:
            log.info(f'error===={e}')
    finally:
        browser.quit()

if __name__ =='__main__':
    # Script entry point. ``taskType`` and ``start_time`` are module-level names
    # that the functions above read when they record task logs.

    list_c = []
    list_all_info_1 = []
    num = 0
    taskType = '企业公告/东方财富网'
    while True:
        start_time = time.time()
        # Company to process. The Redis-backed source is disabled and a fixed
        # code is used instead:
        # social_code = baseCore.redicPullData('NoticeEnterprise:ggqy_socialCode_add')
        social_code = '91330000747735638J'
        # Nothing to process (empty value or the literal string 'None'): wait and retry.
        if not social_code or social_code == 'None':
            time.sleep(20)
            continue
        dic_info = baseCore.getInfomation(social_code)
        # count = dic_info[15]
        code, com_name = dic_info[3], dic_info[1]
        log.info(f'-----开始处理{com_name}----{social_code}------')
        try:
            gonggao_info(dic_info)
        except:
            log.info(f'-----error:{com_name}----{social_code}------')
        # Single pass only: leave the loop after one company.
        break




