"""
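CSRC (China Securities Regulatory Commission) listed-company roster.

Scrapes the company listing pages on eid.csrc.gov.cn and inserts companies
that are not yet present into the ``ipo_enterprise_list`` table.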
"""
import time
import random
import requests
from bs4 import BeautifulSoup
from retry import retry
from base import BaseCore
from obs import ObsClient

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Project helper object: the logger and the database handles below all come from it.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# Connection/cursor pair used by this script for the ipo_enterprise_list queries.
cnx, cursor = baseCore.cnx, baseCore.cursor

# Second connection/cursor pair exposed by BaseCore; not referenced elsewhere
# in the visible part of this file.
cnx_, cursor_ = baseCore.cnx_, baseCore.cursor_

# Task label ("enterprise list / CSRC").
taskType = '企业名单/证监会'

def createDriver():
    """Create a headless Chrome WebDriver configured for scraping.

    The chromedriver executable and the Chrome binary are read from fixed
    local paths hard-coded below.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and should call
        ``quit()`` when finished.
    """
    chrome_driver = r'D:\cmd100\chromedriver.exe'
    service = Service(chrome_driver)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-certificate-errors')
    # Hide the "controlled by automation" Blink feature flag from the page.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    chrome_options.add_argument('--headless')
    # To route traffic through a proxy, add for example:
    #   chrome_options.add_argument('--proxy-server=http://127.0.0.1:8080')

    # The options object must be passed as ``options=``: the legacy
    # ``chrome_options=`` keyword is deprecated alongside ``service=`` and is
    # rejected by newer Selenium 4 releases.
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

@retry(tries=3, delay=5)
def RequestUrl(url):
    """Download *url* with ``requests`` and return the parsed document.

    The request uses the module-level ``headers`` dict, which this file
    assigns in its ``__main__`` section. The ``retry`` decorator re-runs the
    function when it raises (``tries=3``, ``delay=5``).

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built with the ``lxml`` parser.

    Raises:
        RuntimeError: If the response status code is not 200.
        requests.RequestException: On connection errors or timeouts.
    """
    # A timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url=url, headers=headers, timeout=30)
    if response.status_code != 200:
        # Previously a bare ``raise`` with no active exception, which only
        # failed by accident with a message-less RuntimeError. Same exception
        # type, now with the status and URL attached.
        raise RuntimeError(f'unexpected HTTP status {response.status_code} for {url}')
    # Detect the encoding only for responses that will actually be parsed.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'lxml')

def browserRequest(url):
    """Load *url* in a fresh headless Chrome and return the rendered page.

    Waits up to 30 seconds for an element with class ``m-table2`` to be
    present before reading the page source.

    Args:
        url: Address of the page to render.

    Returns:
        A ``BeautifulSoup`` tree (``html.parser``) of the rendered page source.

    Raises:
        Whatever Selenium raises while loading or waiting (for example a
        timeout if the table never appears); the browser is closed first.
    """
    browser = createDriver()
    try:
        browser.get(url)
        WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "m-table2"))
        )
        page_source = browser.page_source
    finally:
        # Always shut the browser down; without this every call leaves a
        # Chrome/chromedriver process behind.
        browser.quit()
    return BeautifulSoup(page_source, 'html.parser')

def getUrl(url_parm):
    """Return the first listing-page URL for the section id *url_parm*.

    Args:
        url_parm: Section identifier placed in the URL path (e.g. ``'202610'``).

    Returns:
        The string ``http://eid.csrc.gov.cn/<url_parm>/index_f.html``.
    """
    # The original placeholder comments name the Shenzhen, Shanghai and
    # Beijing exchanges; only this one URL pattern is implemented.
    listing_template = 'http://eid.csrc.gov.cn/{}/index_f.html'
    return listing_template.format(url_parm)

# Field-name mapping
def getmap(dic_info):
    """Convert scraped Chinese field labels into database column names.

    Args:
        dic_info: Dict keyed by the Chinese labels listed below. Labels that
            are absent yield an empty string; keys not listed are ignored.

    Returns:
        A new dict keyed by column name, in the order listed below. The
        result is also printed to stdout.
    """
    # (scraped label, column name) pairs. 'emial_code' is spelled exactly as
    # the column is named in the INSERT statement elsewhere in this file.
    label_to_column = (
        ('公司全称', 'company_name'),
        ('公司简称', 'short_name'),
        ('英文名称', 'english_name'),
        ('股票代码', 'gpdm'),
        ('上市板块', 'shbk'),
        ('股票类型', 'gp_type'),
        ('法定代表人', 'coreperson'),
        ('注册地址', 'address'),
        ('行业种类', 'hy_type'),
        ('公司网址', 'website'),
        ('上市时间', 'ipotime'),
        ('邮政编码', 'emial_code'),
        ('公司电话', 'phone'),
    )
    dict3 = {}
    for label, column in label_to_column:
        dict3[column] = dic_info.get(label, '')
    print(dict3)
    return dict3

# Collect company information
def _parse_detail_fields(soup_info):
    """Return ``{label: value}`` pairs read from the ``m-table3`` detail table."""
    rows = soup_info.find('table', class_='m-table3').find_all('tr')[1:]
    fields = {}
    for row in rows:
        for cell in row.find_all('td'):
            span_tag = cell.find('span')
            if span_tag is None:
                # Cell without a value span: nothing to extract.
                continue
            value = span_tag.text.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
            # Remove the value span so the remaining cell text is just the label.
            span_tag.decompose()
            name = cell.text.replace('：', '')
            fields[name] = value
    return fields


def SpiderByZJH(url, start_time):
    """Crawl every page of a listing and insert companies not yet stored.

    The first page is fetched with ``RequestUrl`` to read the page count;
    each listing page is then rendered with ``browserRequest``. For every row
    whose (stock code, company name) pair is not already in
    ``ipo_enterprise_list``, the detail page is fetched, its fields are
    mapped with ``getmap`` and the row is inserted and committed.

    Args:
        url: URL of the first listing page.
        start_time: Timestamp passed to ``baseCore.getTimeCost`` for the
            per-page elapsed-time log line.

    Returns:
        None. Failures to fetch or parse a page are logged and skipped.
    """
    try:
        soup = RequestUrl(url)
    except Exception:
        # Request failed even after retries: log, back off, give up on this listing.
        # (Original note: the URL should be re-queued in redis; not implemented.)
        log.error(f'请求失败:{url}')
        time.sleep(random.randint(60, 120))
        return

    # The last <li> of the pager holds the total page count inside a <b> tag.
    try:
        total = int(soup.find('div', class_='pages').find_all('li')[-1].find('b').text)
    except (AttributeError, IndexError, ValueError) as e:
        log.error(f'获取总页数失败:{url}---{e}')
        return

    select_sql = "select count(1) from ipo_enterprise_list where gpdm=%s and company_name=%s"
    insert_sql = "insert into ipo_enterprise_list(company_name,short_name,english_name,gpdm,shbk,gp_type,coreperson,address,hy_type,website,ipotime,emial_code,phone) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

    for i in range(1, total + 1):
        log.info(f'==========正在采集第{i}页=========')
        if i == 1:
            href = url
        else:
            # NOTE(review): the sample URL in this file's comments is
            # .../index_3_f.html, while this builds index_{i}.html - confirm
            # which form the site actually serves.
            href = url.split('index')[0] + f'index_{i}.html'
        try:
            soup = browserRequest(href)
        except Exception:
            # Skip this page; falling through would re-parse the previous page's soup.
            log.error(f'请求失败:{href}')
            continue

        table = soup.find('table', class_='m-table2')
        if table is None:
            log.error(f'未找到列表:{href}')
            continue

        # The first row is the header.
        for tr in table.find_all('tr')[1:]:
            cells = tr.find_all('td')
            if len(cells) < 3:
                continue
            gpdm = cells[0].text
            short_name = cells[1].text
            companyname = cells[2].text

            # Parameterized query: scraped text may contain quotes.
            cursor.execute(select_sql, (gpdm, companyname))
            if cursor.fetchone()[0] > 0:
                log.info(f"{gpdm}-------{companyname}---已经存在")
                continue

            link = cells[0].find('a')
            if link is None or not link.get('href'):
                log.error(f'未找到详情链接:{gpdm}---{companyname}')
                continue
            info_url = 'http://eid.csrc.gov.cn/' + link['href']

            try:
                soup_info = RequestUrl(info_url)
            except Exception:
                # Leave the company un-inserted so a later run can pick it up,
                # instead of aborting the whole crawl.
                log.error(f'请求失败:{info_url}')
                continue

            try:
                detail = _parse_detail_fields(soup_info)
            except Exception as e:
                # Detail table missing or malformed: store the listing fields only.
                log.info(f'error---{e}---第{i}页--{info_url}')
                detail = {}

            dic_info = {
                '公司全称': companyname,
                '公司简称': short_name,
                '股票代码': gpdm,
            }
            # Detail-page values take precedence over the listing-row values.
            dic_info.update(detail)

            # Insert into the database; getmap returns the columns in the
            # same order as the INSERT column list.
            final_dic = getmap(dic_info)
            cursor.execute(insert_sql, tuple(final_dic.values()))
            cnx.commit()
            log.info(f"{gpdm}-------{companyname}---新增")
        log.info(f"【{i}/{total}】-----------end,耗时{baseCore.getTimeCost(start_time, time.time())}")



if __name__ == '__main__':
    # Browser-like request headers. RequestUrl reads this module-level name.
    # The If-Modified-Since / If-None-Match values captured from a browser
    # session were removed: fixed conditional headers can make the server
    # answer 304 with no body, which RequestUrl treats as a failure.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'eid.csrc.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }

    # Listing URLs noted by the original author:
    #   Shanghai  http://eid.csrc.gov.cn/101111/index.html
    #   Shenzhen  http://eid.csrc.gov.cn/101811/index.html
    #   Beijing   http://eid.csrc.gov.cn/102611/index.html
    # Paging example: http://eid.csrc.gov.cn/101811/index_3_f.html
    #
    # Section ids used on earlier runs (original note: Shanghai main board /
    # Shanghai STAR Market): ['201010', '201014'], ['201011', '201013']
    url_parms = ['202610']

    start_time = time.time()
    try:
        for url_parm in url_parms:
            url = getUrl(url_parm)
            log.info('======开始处理======')
            SpiderByZJH(url, start_time)
    finally:
        # Release database resources even if the crawl raised.
        cursor.close()
        cnx.close()
        baseCore.close()
