# 雅虎财经企业动态获取
import json
import os
import signal
import time
import pymysql
from kafka import KafkaProducer
from selenium.webdriver.common.by import By
import sys
from retry import retry
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import traceback

# Make the project-local BaseCore module importable (hard-coded Windows path).
sys.path.append('D:\\zzsn_spider\\base')
import BaseCore
import urllib3

# Silence urllib3's InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Shared helper object: this file takes its logger, the `r` handle, the MySQL
# connection/cursor and the browser factory from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Used below via r.rpush(...) to requeue companies (presumably a Redis client — confirm in BaseCore).
r = baseCore.r

# Task label passed to baseCore.recordLog() for every log record written by this script.
taskType = '企业动态/雅虎财经'
# smart =smart_extractor.SmartExtractor('cn')

# NOTE(review): no function in the visible code declares `global last_url`,
# so this module-level value is never updated — confirm before removing.
last_url = ''


# Send one article to Kafka
@retry(tries=3, delay=5)
def sendKafka(dic_news):
    """Publish one article dict to the ``researchReportTopic`` Kafka topic.

    The dict is serialised as UTF-8 JSON (non-ASCII characters kept as-is)
    and sent through a producer created for this call. The call blocks for
    up to 10 seconds waiting for the send result, and the producer is always
    closed afterwards so repeated calls and retries do not leak connections.

    :param dic_news: JSON-serialisable dict describing the article.
    :raises Exception: any serialisation or Kafka error; the ``retry``
        decorator re-invokes the function (3 tries, 5 s delay) before the
        error reaches the caller.
    """
    payload = json.dumps(dic_news, ensure_ascii=False).encode('utf8')
    producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
    try:
        kafka_result = producer.send("researchReportTopic", payload)
        # .get() waits for the send to complete and raises if it failed.
        print(kafka_result.get(timeout=10))
    finally:
        # Bounded close: do not hang on an unreachable broker.
        producer.close(timeout=10)

    dic_result = {
        'success': 'true',  # was misspelled 'ture'
        'message': '操作成功',
        'code': '200',
    }
    log.info(dic_result)


# Persist one article row to MySQL
@retry(tries=3, delay=5)
def insertMysql(list_info):
    """Insert one row into ``brpa_source_article`` and commit it.

    ``list_info`` supplies the seven placeholder values in column order
    (social_credit_code, source_address, origin, type, publish_time, title,
    content); ``create_time`` is filled by MySQL's ``now()``.

    Uses the module-level ``cursor`` and ``cnx``. Any database error
    propagates to the ``retry`` decorator (3 tries, 5 s delay).
    """
    row_values = tuple(list_info)
    cursor.execute(
        '''insert into brpa_source_article(social_credit_code,source_address,origin,type,publish_time,title,content,create_time) values(%s,%s,%s,%s,%s,%s,%s,now())''',
        row_values,
    )
    cnx.commit()


# Duplicate check
def selectUrl(news_url, xydm):
    """Look up an article URL for one company in ``brpa_source_article``.

    Runs a parameterized query on the module-level ``cursor`` and returns
    whatever ``fetchall()`` yields; callers treat an empty result as "this
    URL has not been stored for this company yet".

    :param news_url: value compared with the ``source_address`` column.
    :param xydm: value compared with the ``social_credit_code`` column.
    """
    query = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
    params = (news_url, xydm)
    cursor.execute(query, params)
    return cursor.fetchall()


# Get the URL of the last news item currently in the stream
def getLastUrl(driver):
    """Return the link of the last ``<li>`` in the news stream.

    The anchor is looked up at ``div[2]/h3/a`` first and at ``div[1]/h3/a``
    as a fallback (two layouts of the list item). The href is stripped of
    surrounding whitespace and has single quotes doubled, which matches how
    the main loop normalises URLs before storing them.

    :param driver: Selenium WebDriver positioned on a quote page.
    :raises IndexError: when the stream contains no list items.
    :raises Exception: whatever the fallback lookup raises when neither
        layout matches.
    """
    news_div = driver.find_element(By.ID, 'quoteNewsStream-0-Stream')
    news_lis = news_div.find_elements(By.XPATH, "./ul/li")
    if not news_lis:
        # Same exception type the original indexing produced, but explicit.
        raise IndexError("news stream contains no list items")
    last_item = news_lis[-1]

    def _clean_href(xpath):
        # Locate the anchor relative to the last item and normalise its href.
        anchor = last_item.find_element(By.XPATH, xpath)
        return anchor.get_attribute("href").strip().replace("'", "''")

    try:
        return _clean_href("./div[1]/div[1]/div[2]/h3[1]/a")
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must still propagate.
        return _clean_href("./div[1]/div[1]/div[1]/h3[1]/a")


# Scroll down repeatedly so the page loads more news items
def scroll(driver, xydm, name, gpdm):
    """Scroll the quote page until no further news needs to be loaded.

    Each round scrolls to the bottom, waits, and reads the URL of the last
    news item. The loop stops when:

    * the last URL cannot be read,
    * the duplicate check fails or reports that the last URL is already
      stored for this company (incremental mode: older items are known), or
    * the last URL did not change since the previous round (nothing new
      was loaded).

    :param driver: Selenium WebDriver positioned on the quote page.
    :param xydm: company code used for the duplicate check.
    :param name: company name, used only in log messages.
    :param gpdm: ticker symbol, used only in log messages.
    """
    previous_url = ''
    while True:
        driver.execute_script("var q=document.documentElement.scrollTop=100000")
        time.sleep(1)
        try:
            current_url = getLastUrl(driver)
        except Exception:
            log.error(f"{name}--{gpdm}--获取不到最后一条链接")
            break

        # Incremental mode: check the URL just read. The original queried the
        # previous round's URL ('' on the first pass), which delayed the stop
        # by one scroll.
        try:
            already_stored = selectUrl(current_url, xydm)
        except Exception:
            log.error(f"{name}--{gpdm}--查重失败")
            break
        if already_stored:
            break

        if previous_url == current_url:
            break
        previous_url = current_url
        time.sleep(1)


# Put a company whose collection failed back into Redis
def rePutIntoR(item):
    """Requeue ``item`` by calling ``rpush`` on the ``NewsEnterprise:gwqy_socialCode`` list."""
    queue_key = 'NewsEnterprise:gwqy_socialCode'
    r.rpush(queue_key, item)




# Fetch one article's detail page and forward it
@retry(tries=5,delay=5)
def getZx(driver,xydm, url, title, origin):
    """Open one article page, extract its fields and send them to Kafka and MySQL.

    Returns a status string:

    * ``''`` on success, and also when the body text is shorter than 400
      characters (in that case nothing is sent or stored),
    * ``'Kafka操作失败'`` when ``sendKafka`` raised,
    * ``'数据库传输失败'`` when ``insertMysql`` raised,
    * ``'数据id获取失败'`` for any other error while building the payload.

    Failures while loading the page or locating the author, time or body
    elements are not caught here, so they reach the ``retry`` decorator
    (5 tries, 5 s delay).

    :param driver: Selenium WebDriver used to load ``url``.
    :param xydm: company code stored in the MySQL row.
    :param url: article URL.
    :param title: article title taken from the news list.
    :param origin: source name taken from the news list.
    """
    fetch_started = time.time()
    driver.get(url)

    # Best effort: click the element with class "collapse-button" if it is
    # present, then remove it from the DOM. Any failure is ignored on purpose.
    try:
        toggle_btn = driver.find_element(By.CLASS_NAME, "collapse-button")
        toggle_btn.click()
        time.sleep(0.5)
        driver.execute_script("arguments[0].remove()", toggle_btn)
        time.sleep(0.5)
    except Exception:
        pass
    time.sleep(0.5)

    byline_el = driver.find_element(By.CLASS_NAME, "caas-author-byline-collapse")
    time_wrapper = driver.find_element(By.CLASS_NAME, "caas-attr-time-style")
    time_el = time_wrapper.find_element(By.TAG_NAME, "time")
    body_html = driver.find_element(By.CLASS_NAME, "caas-body").get_attribute('outerHTML')

    author = byline_el.text.strip().replace("'", "''")

    # "YYYY-MM-DDTHH:MM:SS..." -> "YYYY-MM-DD HH:MM:SS"
    stamp = time_el.get_attribute("datetime").strip().replace("'", "''").replace("T", " ")
    pub_time = stamp[:19]

    content = driver.find_element(By.CLASS_NAME, "caas-body").text
    if len(content) < 400:
        # Short bodies are skipped but reported with the success status ''.
        return ''
    body_html = body_html.replace("'", "''")

    # Row for insertMysql(); only the first 500 characters of the HTML are stored.
    list_info = [xydm, url, f'雅虎财经-{origin}', '2', pub_time, title, body_html[:500]]

    log.info(f"文章耗时，耗时{baseCore.getTimeCost(fetch_started, time.time())}")
    try:
        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        # NOTE(review): 'socialCreditCode' reads the module-level `social_code`
        # set by the main loop, not the `xydm` parameter — confirm they match.
        dic_news = {
            'attachmentIds': '',
            'author': author,
            'content': content,
            'contentWithTag': body_html,
            'createDate': time_now,
            'deleteFlag': '0',
            'id': '',
            'keyWords': '',
            'lang': 'en',
            'origin': origin,
            'publishDate': pub_time,
            'sid': '1714853151160340481',
            'sourceAddress': url,  # link to the original article
            'summary': '',
            'title': title,
            'type': 2,
            'socialCreditCode': social_code,
            'year': pub_time[:4]
        }
        # Kafka first; the MySQL row is written only after Kafka accepted the message.
        try:
            sendKafka(dic_news)
            try:
                insertMysql(list_info)
            except Exception:
                log.error("保存数据库失败")
                return '数据库传输失败'
            return ''
        except Exception as kafka_err:
            log.error({
                'success': 'false',
                'message': '操作失败',
                'code': '204',
                'e': kafka_err
            })
            return 'Kafka操作失败'
    except Exception:
        log.info(f'传输失败:{social_code}----{url}')
        return '数据id获取失败'


if __name__ == "__main__":
    # Script entry point. Repeatedly takes a company code from Redis, opens
    # its Yahoo Finance quote page, collects the news links, and sends every
    # not-yet-stored article through getZx(). The loop ends after 15 counted
    # errors without an intervening success, when the company lookup fails,
    # or once the process has run for six hours.
    StartTime = time.time()
    path = r'D:\zzsn_spider\comData\cmd6\chromedriver.exe'
    driver = baseCore.buildDriver(path, headless=True)
    # Module-level names read by insertMysql() and selectUrl().
    cnx = baseCore.cnx
    cursor = baseCore.cursor
    # Error counter: incremented on the failure paths below, reset to 0 after
    # a company has been processed completely.
    ErrorNume = 0

    while True:

        # Too many errors since the last success: stop the run.
        if ErrorNume >= 15:
            driver.quit()
            break

        # Take the next social credit code from Redis; the matching basic
        # information is fetched from the database below.
        social_code = baseCore.redicPullData('NewsEnterprise:gwqy_socialCode')
        # social_code = 'ZZSN22080900000046'

        # If Redis has no more data, wait 30 minutes and poll again.
        if not social_code:
            log.info('============已没有数据============等待===============')
            time.sleep(1800)
            continue
        # NOTE(review): unreachable — `not social_code` above already handles None.
        if social_code == None:
            time.sleep(1800)
            continue
        try:
            data = baseCore.getInfomation(social_code)
        except:
            # Lookup failed: requeue the code and end the run.
            rePutIntoR(social_code)
            driver.quit()
            break
        # Fields are read by position from the record returned by getInfomation().
        name = data[1]
        # NOTE(review): enname is not used anywhere in the visible code.
        enname = data[5]
        gpdm = data[3]
        # Ticker normalisation for codes containing 'HK': when the part before
        # the first '.' is 5 characters long, drop the code's first character
        # (presumably a leading zero so it matches Yahoo's symbol — TODO confirm).
        if 'HK' in str(gpdm):
            tmp_g = str(gpdm).split('.')[0]
            if len(tmp_g) == 5:
                gpdm = str(gpdm)[1:]
            else:
                pass
        # Strip a trailing '.N' / '.O' suffix (presumably exchange markers — confirm).
        elif str(gpdm)[-2:] == '.N' or str(gpdm)[-2:] == '.O':
            gpdm = gpdm[:-2]
        xydm = data[2]

        # Number of times this company has been collected for this task.
        count = data[18]
        start_time = time.time()
        # No ticker: record the failure and skip the company (it is not requeued).
        if (gpdm == ''):
            log.error(f"{name}--股票代码为空 跳过")
            exception = '股票代码为空'
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(xydm, taskType, state, takeTime, '', exception)
            continue
        try:
            url = f"https://finance.yahoo.com/quote/{gpdm}/?p={gpdm}"
            driver.get(url)
            # Yahoo's mainland-China block notice in the page is treated as a
            # proxy failure: requeue the company and count an error.
            if '用户将无法从中国大陆使用 Yahoo 的产品与服务' in driver.page_source:
                log.error('代理失效')
                time.sleep(5)
                #driver.quit()
                #driver = baseCore.buildDriver(path, headless=True)
                rePutIntoR(social_code)
                ErrorNume += 1
                continue
            # Redirected to the consent page: requeue and count an error.
            if 'https://consent.yahoo.com/v2/collectConsent' in driver.current_url:
                log.error('页面跳转，出现弹窗')
                time.sleep(5)
                #driver.quit()
                #driver = baseCore.buildDriver(path, headless=True)
                rePutIntoR(social_code)
                ErrorNume += 1
                continue
            # A URL containing 'lookup' is treated as a wrong ticker: record
            # it and skip the company without requeueing.
            if 'lookup' in driver.current_url:
                log.error(f"{name}--{gpdm}--股票代码错误")
                exception = '股票代码错误'
                state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(xydm, taskType, state, takeTime, driver.current_url, exception)
                continue
            try:
                # Wait up to 15 seconds for the news stream container to become visible.
                WebDriverWait(driver, 15).until(EC.visibility_of_element_located((By.ID, 'quoteNewsStream-0-Stream')))
                div_flg = driver.find_element(By.ID, 'quoteNewsStream-0-Stream')
                # The stream shows Yahoo's "nothing found" text: record and skip.
                if "We're sorry we weren't able to find anything about this topic." in div_flg.text:
                    log.error(f"{driver.current_url}")
                    log.error(f"{name}--{gpdm}--没找到新闻元素")
                    exception = '没找到新闻元素'
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
                    continue
            except Exception as e:
                # The stream never became visible: requeue and count an error.
                log.error(f"{name}--{gpdm}--页面打开失败")
                time.sleep(5)
                #driver.quit()
                #driver = baseCore.buildDriver(path, headless=True)
                rePutIntoR(social_code)
                ErrorNume += 1
                continue
            # Load more items; a scrolling failure is only logged and the
            # items already on the page are still processed.
            try:
                scroll(driver,xydm, name, gpdm)
            except Exception as e:
                log.error(f"{name}--{gpdm}--拖拽出现问题")
            news_div = driver.find_element(By.ID, 'quoteNewsStream-0-Stream')
            news_lis = news_div.find_elements(By.XPATH, "./ul/li")
            log.info(f"{name}--{gpdm}--{len(news_lis)}条信息")

            # Flag: set to 1 when the browser appears to have lost its connection to the site.
            flag = 0
            news_info_list = []
            # Collect each item's url, title and origin into news_info_list.
            for i in range(0, len(news_lis)):
                try:
                    # Two list-item layouts: the anchor is under div[2] or div[1].
                    try:
                        a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[2]/h3[1]/a")
                    except:
                        a_ele = news_lis[i].find_element(By.XPATH, "./div[1]/div[1]/div[1]/h3[1]/a")
                except Exception as e:
                    if news_lis[i].is_displayed():
                        # The item is displayed but has no anchor: record it and skip this item.
                        log.error(f"{name}--{gpdm}--{i}----a标签没找到")
                        exception = 'a标签没找到'
                        state = 0
                        takeTime = baseCore.getTimeCost(start_time, time.time())
                        baseCore.recordLog(xydm, taskType, state, takeTime, url, exception)
                        continue
                    else:
                        # The item is not displayed: treated as a lost connection.
                        # Restart the browser, requeue the company and abandon it.
                        log.error(f"{name}--{gpdm}--{i}----与网站断开连接")
                        time.sleep(5)
                        driver.quit()
                        driver = baseCore.buildDriver(path, headless=True)
                        rePutIntoR(social_code)
                        ErrorNume += 1
                        flag = 1
                        break
                # NOTE(review): single quotes are doubled (SQL-style escaping)
                # although the queries below are parameterized; stored URLs and
                # titles therefore contain the doubled quotes.
                news_url = a_ele.get_attribute("href").lstrip().strip().replace("'", "''")
                # Only links hosted on finance.yahoo.com are collected.
                if (news_url.startswith("https://finance.yahoo.com")):
                    title = a_ele.text.lstrip().strip().replace("'", "''")
                    try:
                        origin = news_lis[i].find_element(By.XPATH, './div/div/div[2]/div/span[1]').text
                    except:
                        origin = news_lis[i].find_element(By.XPATH, './div/div/div/div/span[1]').text
                    if origin == '':
                        log.error('来源获取失败')
                        continue
                    news_info_list.append([news_url,title,origin])
                else:
                    continue
            if flag == 1:
                continue
            for i in range(len(news_info_list)):
                news_url = news_info_list[i][0]
                title = news_info_list[i][1]
                origin = news_info_list[i][2]
                # Check whether this URL is already stored for the company.
                # NOTE(review): same query as selectUrl().
                sel_sql = '''select social_credit_code from brpa_source_article where source_address = %s and social_credit_code=%s '''
                cursor.execute(sel_sql, (news_url, xydm))
                selects = cursor.fetchall()
                if selects:
                    log.info(f"{name}--{gpdm}--网址已经存在----{news_url}")
                    exception = '网址已存在'
                    state = 0
                    takeTime = baseCore.getTimeCost(start_time, time.time())
                    baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
                    # Incremental mode: stop at the first URL that is already stored.
                    break
                    # Full mode: use `continue` instead of `break`.
                    #continue
                try:
                    exception = getZx(driver,xydm, news_url, title, origin)
                except:
                    # getZx raised (after its own retries): restart the browser
                    # and record this article as failed.
                    log.error('获取正文失败')
                    driver.quit()
                    driver = baseCore.buildDriver(path, headless=True)
                    exception = '获取正文失败'
                #if exception == '超过截止日期':
                #    log.info(f'{name}--{gpdm}--九月一日前数据以采集完毕')
                #    break
                # getZx() returns '' on success; anything else is a failure reason.
                if exception == '':
                    state = 1
                else:
                    state = 0
                takeTime = baseCore.getTimeCost(start_time, time.time())
                baseCore.recordLog(xydm, taskType, state, takeTime, news_url, exception)
                log.info(f"{name}--{gpdm}--{i}----{news_url}")
            log.info(f"{name}--{gpdm}--企业整体，耗时{baseCore.getTimeCost(start_time, time.time())}")

            # After collection finishes, update the company's run count.
            runType = 'NewsRunCount'
            count += 1
            baseCore.updateRun(social_code, runType, count)
            ErrorNume = 0

        except Exception as e:
            # Any other failure is logged with a fixed "connection closed by
            # remote host" message (the actual exception `e` is not logged);
            # the browser is restarted and the company requeued.
            # NOTE(review): this path requeues xydm, the others requeue social_code.
            rePutIntoR(xydm)
            state = 0
            takeTime = baseCore.getTimeCost(start_time, time.time())
            baseCore.recordLog(xydm, taskType, state, takeTime, '', '远程主机强迫关闭了一个现有的连接。')
            log.info(f"-------{name}--{gpdm}---'远程主机强迫关闭了一个现有的连接。'--------")
            log.info('===========连接已被关闭========等待重新连接===========')
            driver.quit()
            driver = baseCore.buildDriver(path, headless=True)
            time.sleep(5)
            ErrorNume += 1
            continue


        # Stop after six hours of run time (the log message says "restart").
        # NOTE(review): only reached when an iteration completes without `continue`.
        EndTime = time.time()
        if EndTime - StartTime >= 6 * 60 * 60 :
            driver.quit()
            log.info('程序执行超过6小时，重启')
            break

    cursor.close()
    cnx.close()
    # Release resources.
    baseCore.close()
