import json
import time

import numpy as np
import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup
from kafka import KafkaProducer
import sys
sys.path.append(r'D:\zzsn_spider\base')
import BaseCore
from retry import retry


# Page fetches in this script are made with verify=False, which makes urllib3
# emit an InsecureRequestWarning per request; silence that warning globally.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Task label ("company basic information / Yahoo Finance").
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
taskType = '企业基本信息/雅虎财经'

# Project-local helper object; the handles below are taken from it once at import time.
baseCore = BaseCore.BaseCore()
# NOTE(review): presumably a database cursor/connection pair - confirm in BaseCore.
cursor = baseCore.cursor
cnx = baseCore.cnx
# NOTE(review): presumably a Redis client - confirm in BaseCore.
r = baseCore.r
# Logger used by the functions below for progress and error messages.
log = baseCore.getLogger()
# Browser-like request headers (Chrome 106 on Windows) sent with every page fetch.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': "Windows",
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}


# Build the spreadsheet row for one company
def saveBaseInfo(info, xydm, gpdm, id_):
    """Flatten a scraped company profile into a single output row.

    Args:
        info: Mapping whose ``'base_info'`` entry holds the profile fields
            produced by ``getInfo``.
        xydm: Credit code stored in the second column of the row.
        gpdm: Stock code stored in the last column of the row.
        id_: Record id stored in the first column of the row.

    Returns:
        A list ordered as: id, credit code, English name, phone, website,
        company summary, industry, address, stock code.

    Raises:
        KeyError: If ``'base_info'`` or any of the fields read from it is missing.
    """
    profile = info['base_info']
    # Website and address are read first, matching the original lookup order.
    website = profile['公司网站']
    address = profile['地址']
    row = [
        id_,
        xydm,
        profile['英文名'],
        profile['电话'],
        website,
        profile['公司简介'],
        profile['行业'],
        address,
        gpdm,
    ]
    return row


# Fetch a URL and return the response
@retry(tries=5, delay=3)
def getRes(url, timeout=30):
    """GET ``url`` with the module-level browser headers and return the response.

    The ``retry`` decorator re-invokes this function up to 5 times, waiting
    3 seconds between attempts, whenever it raises.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up on one
            attempt. Without a timeout a stalled connection would block the
            whole collection run indefinitely.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP 200.

    Raises:
        RuntimeError: If the server answered with a status other than 200.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, headers=headers, verify=False, timeout=timeout)
    if response.status_code != 200:
        status = response.status_code
        # Release the connection before signalling failure to the retry wrapper.
        response.close()
        # The original used a bare `raise` here, which itself fails with
        # "RuntimeError: No active exception to reraise"; keep the exception
        # type but say what actually went wrong.
        raise RuntimeError(f'unexpected HTTP status {status} for {url}')
    return response


def _normalizeSymbol(gpdm):
    """Convert a stock code into the symbol form used in the profile URL."""
    symbol = str(gpdm)
    if 'HK' in symbol:
        # Codes whose part before the first '.' has five characters lose their
        # first character (presumably a leading zero Yahoo omits - confirm).
        if len(symbol.split('.')[0]) == 5:
            return symbol[1:]
        return gpdm
    if symbol.endswith(('.N', '.O')):
        # Drop the two-character '.N' / '.O' suffix.
        return symbol[:-2]
    return gpdm


def _safeText(locate):
    """Return the stripped text of the tag returned by ``locate()``, or '' on failure."""
    try:
        return locate().text.strip()
    except Exception:
        # Best-effort field: a missing tag or index must not abort the scrape.
        return ''


def _extractAddress(com_info):
    """Return the address lines of the first contact paragraph joined by spaces."""
    try:
        paragraph = com_info.find_all('p')[0]
        # Links in this paragraph hold the phone number and website, not the
        # address. NOTE: this removes them from the parsed tree for good.
        for link in paragraph.select('a'):
            link.decompose()
        # Turn each <br/>-separated line into its own <p> so it can be read separately.
        html = str(paragraph).replace('<br/>', '</p><p>')
        lines = BeautifulSoup(html, 'html.parser').select('p')
        return ' '.join(line.text.strip() for line in lines).strip()
    except Exception:
        # Best-effort field, same as the other optional profile values.
        return ''


# Fetch a company's basic information by stock code
def getInfo(gpdm, start):
    """Scrape the Yahoo Finance profile page of one stock code.

    Args:
        gpdm: Stock code as listed in the input sheet.
        start: Timestamp (``time.time()``) taken by the caller; used only to
            log the elapsed time on success.

    Returns:
        A tuple ``(retData, message)``. On success ``retData['base_info']``
        holds the profile fields and ``message`` is ``'成功'``. On failure
        ``retData`` is an empty dict and ``message`` is one of
        ``'访问基本信息页面失败'``, ``'股票代码未查询到信息'``, ``'请求失败'``,
        ``'没有基本信息'`` or ``'其它错误原因'``.
    """
    gpdm_ = _normalizeSymbol(gpdm)
    retData = {}
    url = f'https://finance.yahoo.com/quote/{gpdm_}/profile?p={gpdm_}'

    # Fixed pause before every request.
    time.sleep(3)
    try:
        response = getRes(url)
    except Exception:
        log.error(f"{gpdm}------访问基本信息页面失败")
        return retData,'访问基本信息页面失败'

    # Close the response on every exit path, not only after a successful parse.
    try:
        if 'lookup' in response.url:
            log.error(f"{gpdm}------股票代码未查询到信息：{response.status_code}")
            return retData,'股票代码未查询到信息'

        # Any other change of the final URL is treated as a failed request.
        if url != response.url:
            log.error(f'{gpdm}------请求失败')
            return retData,'请求失败'

        soup = BeautifulSoup(response.content, 'html.parser')
    finally:
        response.close()

    page = soup.find('div', {'id': 'Col1-0-Profile-Proxy'})
    if page is None:
        # find() returns None when the container is absent; previously this
        # crashed on `page.text` and aborted the whole batch.
        log.error(f'{gpdm}------其它错误原因')
        return retData,'其它错误原因'

    if page.text == '' or 'Invalid Date data is not available' in page.text:
        log.error(f'{gpdm}---没有基本信息')
        return retData,'没有基本信息'

    # The company name is the only mandatory field.
    heading = page.find('h3', {'class': 'Fz(m) Mb(10px)'})
    if heading is None:
        log.error(f'{gpdm}------其它错误原因')
        return retData,'其它错误原因'
    name = heading.text.strip()

    com_info = page.find('div', {'class': 'Mb(25px)'})

    # Phone and website must be read BEFORE the address, because address
    # extraction removes the <a> tags from the first paragraph.
    com_phone = _safeText(lambda: com_info.find_all('p')[0].find('a'))
    com_url = _safeText(lambda: com_info.find_all('p')[0].find('a', {'target': '_blank'}))
    com_address = _extractAddress(com_info)

    # The second paragraph lists sector, industry and employee count, in that
    # order, as bold spans; any that are missing stay empty.
    try:
        facts = com_info.find_all('p')[1].find_all('span', {'class': 'Fw(600)'})
    except Exception:
        facts = []
    fact_texts = [_safeText(lambda span=span: span) for span in facts[:3]]
    fact_texts += [''] * (3 - len(fact_texts))
    com_bumen, com_hangye, com_people = fact_texts

    com_jianjie = _safeText(lambda: page.find('p', {'class': 'Mt(15px) Lh(1.6)'}))

    retData['base_info'] = {
        '英文名': name,
        '股票代码': gpdm,
        '地址': com_address,
        '电话': com_phone,
        '公司网站': com_url,
        '部门': com_bumen,
        '行业': com_hangye,
        '员工人数': com_people,
        '公司简介': com_jianjie
    }
    log.info(f"获取基本信息--{gpdm}，耗时{baseCore.getTimeCost(start, time.time())}")
    return retData,'成功'


# Collection job
def beginWork(
        source_path=r'D:\zzsn_spider\comData\雅虎财经上市企业信息采集50001-55000_20231215.xlsx',
        result_path=r'D:\zzsn_spider\comData\企业基本信息_50001-55000.xlsx',
        error_path=r'D:\zzsn_spider\comData\未采集到企业基本信息_50001-55000.xlsx'):
    """Scrape every stock code in the input workbook and write two Excel reports.

    Reads the ``id`` and ``symbol`` columns from sheet ``yahoostock`` of
    ``source_path``, calls ``getInfo`` for each symbol, and writes:

    * ``result_path`` - one row per successfully scraped company;
    * ``error_path`` - one sheet per recorded failure reason, listing the
      id and stock code of each affected row.

    Failures reported by ``getInfo`` with any other message (page access
    failure, redirected request) are not written to the error workbook; they
    appear only in the log.

    Rows are written with their native value types; they are no longer passed
    through ``np.array``, which turned every value (including ids) into text.

    Args:
        source_path: Input workbook listing the stock codes to scrape.
        result_path: Output workbook for scraped company rows.
        error_path: Output workbook for symbols that could not be scraped.
    """
    result_columns = ['id','信用代码','英文名','电话','官网','简介','行业','地址','股票代码']
    error_columns = ['id','股票代码']
    # (failure message returned by getInfo, sheet name in the error workbook).
    # NOTE(review): the first sheet name says '为' where '未' was probably
    # meant; kept unchanged because readers may look the sheet up by name.
    error_sheets = [
        ('股票代码未查询到信息', '股票代码为查询到信息'),
        ('没有基本信息', '没有基本信息'),
        ('其它错误原因', '其它错误原因'),
    ]

    dic_list = []
    errors = {reason: [] for reason, _ in error_sheets}
    try:
        df = pd.read_excel(source_path, sheet_name='yahoostock')
        for id_, gpdm in zip(df['id'], df['symbol']):
            # Empty cells arrive as NaN, which is truthy; skip them explicitly.
            if pd.isna(gpdm) or not gpdm:
                continue
            info, exc = getInfo(gpdm, time.time())
            if info:
                dic_list.append(saveBaseInfo(info, '', gpdm, id_))
            elif exc in errors:
                errors[exc].append([id_, gpdm])

        # Passing `columns=` keeps the frames valid even when a list is empty;
        # assigning `.columns` to an empty frame raises a length-mismatch error.
        # The writers are context managers: ExcelWriter.save() no longer
        # exists in pandas 2.x, and the files are closed even on errors.
        with pd.ExcelWriter(result_path) as writer:
            pd.DataFrame(dic_list, columns=result_columns).to_excel(writer, index=False)

        with pd.ExcelWriter(error_path) as writer_:
            for reason, sheet_name in error_sheets:
                pd.DataFrame(errors[reason], columns=error_columns).to_excel(
                    writer_, index=False, sheet_name=sheet_name)
    finally:
        # Release resources even if reading, scraping or writing failed.
        baseCore.close()


if __name__ == '__main__':
    beginWork()
