# -*- coding: utf-8 -*-
import json

import openpyxl
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

from kafka import KafkaProducer

from base.BaseCore import BaseCore
# Shared project helper instance; the database handles and the logger used
# by the rest of this script are taken from it.
baseCore = BaseCore()
cnx_ = baseCore.cnx  # connection object; spiderwork() calls cnx_.commit()
cursor_ = baseCore.cursor  # cursor object; spiderwork() calls cursor_.execute()
log = baseCore.getLogger()

import urllib3
# Suppress urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from openpyxl import Workbook, load_workbook


# Create the report workbook
def createFile(file_name):
    """Create a new two-sheet Excel workbook and save it to *file_name*.

    The default sheet is renamed to "需处理企业" and receives a two-column
    header; a second sheet "获取基本信息成功企业" is added with a
    three-column header. Whatever ``Workbook.save`` does with an existing
    file at that path applies here as well.

    Args:
        file_name: Path the workbook is written to.
    """
    workbook = Workbook()

    # The sheet openpyxl creates by default becomes the "to be handled" list.
    pending = workbook.active
    pending.title = "需处理企业"
    pending.append(["企业名称", "社会信用代码"])

    # Second sheet: companies whose basic information was collected.
    workbook.create_sheet("获取基本信息成功企业").append(
        ["企业名称", "社会信用代码", "采到的信用代码"]
    )

    workbook.save(file_name)
    workbook.close()

# Append one row to an existing workbook
def appenddata(file_name, sheet, data):
    """Append *data* as a new row to sheet *sheet* of the workbook *file_name*.

    The workbook is loaded, modified, saved back to the same path and closed
    on every call.

    Args:
        file_name: Path of an existing Excel workbook.
        sheet: Name of the worksheet to append to.
        data: Row values, passed unchanged to the worksheet's ``append``.
    """
    workbook = load_workbook(file_name)
    workbook[sheet].append(data)
    workbook.save(file_name)
    workbook.close()

def sendkafka(post_data):
    """Publish *post_data* as UTF-8 JSON to the ``regionInfo`` Kafka topic.

    A producer is created for this single message and is always closed
    afterwards so its connections are released. Any failure (connecting,
    serialising, sending, or waiting for the acknowledgement) is recorded
    through ``baseCore.recordLog`` and logged; it is not re-raised.

    The failure path reads the module globals ``start_time``,
    ``social_code``, ``taskType`` and ``com_name``, which the ``__main__``
    block defines.

    Args:
        post_data: JSON-serialisable object to send.
    """
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'], api_version=(2, 0, 2))
        payload = json.dumps(post_data, ensure_ascii=False).encode('utf8')
        kafka_result = producer.send("regionInfo", payload)
        # Block (up to 10 s) until the broker acknowledges the record.
        print(kafka_result.get(timeout=10))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        exception = 'kafka传输失败'
        state = 0
        takeTime = baseCore.getTimeCost(start_time, time.time())
        baseCore.recordLog(social_code, taskType, state, takeTime, '', exception)
        log.info(f"{com_name}--{social_code}--kafka传输失败")
    finally:
        # Release the producer whether or not the send succeeded; bound the
        # wait so an unreachable broker cannot stall shutdown indefinitely.
        if producer is not None:
            producer.close(timeout=10)

def deletep(soup, tag_, attribute_to_delete, value_to_delete):
    """Remove every matching tag from *soup* in place.

    When both *attribute_to_delete* and *value_to_delete* are truthy, only
    *tag_* elements carrying that attribute/value pair are removed;
    otherwise every *tag_* element is removed.

    Args:
        soup: Object providing ``find_all`` (a parsed document or tag).
        tag_: Tag name to search for.
        attribute_to_delete: Attribute name used as a filter, or a falsy value.
        value_to_delete: Attribute value used as a filter, or a falsy value.
    """
    if attribute_to_delete and value_to_delete:
        doomed = soup.find_all(tag_, {attribute_to_delete: value_to_delete})
    else:
        doomed = soup.find_all(tag_)

    for node in doomed:
        node.decompose()

def deletek(soup):
    """Remove blank tags from *soup* in place.

    A tag is a candidate when its text is empty and its name is not
    img/video/br, or when its text equals one of the single-character
    strings compared in the predicate. A candidate is removed only if none
    of its descendants is an img, video or br element.
    """
    # Remove blank tags (e.g. <p></p>, <p><br></p>; img, video, hr excepted).
    # NOTE(review): the original comment says "hr" but the code checks "br" - confirm which is intended.
    # NOTE(review): the two single-character comparisons at the end of the predicate
    # render identically here; one may be a different whitespace character - confirm.
    # Precedence: (empty-text and name checks) or text == ' ' or text == ' '.
    for i in soup.find_all(lambda tag: len(tag.get_text()) == 0 and tag.name not in ["img", "video", "br"] and tag.name != "br" or tag.get_text()==' 'or tag.get_text()==' '):
        for j in i.descendants:
            if j.name in ["img", "video", "br"]:
                # Keep tags that wrap media or line-break elements.
                break
        else:
            # for/else: reached only when no descendant triggered the break.
            i.decompose()

def deletespan(td):
    """Detach decorative ``<span>`` elements from *td* in place.

    Three kinds of span are looked up, in this order, and each one found is
    extracted when its text contains the associated marker:

    * ``class="app-copy copy-button-item"`` containing "复制"
    * ``slot="content"`` containing "趋势图"
    * ``class="m-l-r-10"`` containing "年报"

    Args:
        td: Tag (or any object with a compatible ``find_all``) to clean.
    """
    rules = (
        ({'class_': 'app-copy copy-button-item'}, '复制'),
        ({'slot': 'content'}, '趋势图'),
        ({'class_': 'm-l-r-10'}, '年报'),
    )
    for criteria, marker in rules:
        # Each lookup runs after the previous rule's extractions, as before.
        for node in td.find_all('span', **criteria):
            if marker in node.text:
                node.extract()

def getinfo(dict1, dict2):
    """Merge two dicts, preferring truthy values from *dict1*.

    For every key present in either dict, the result holds ``dict1``'s value
    when that value is truthy, otherwise ``dict2``'s value (``None`` when
    ``dict2`` lacks the key). A falsy value in ``dict1`` is therefore
    replaced by whatever ``dict2`` provides, including ``None``.

    Args:
        dict1: Primary mapping.
        dict2: Fallback mapping.

    Returns:
        A new dict over the union of both key sets.
    """
    merged = {}
    for key in set(dict1.keys()) | set(dict2.keys()):
        primary = dict1.get(key, None)
        merged[key] = primary if primary else dict2.get(key, None)
    return merged

def baseinfo(com_soup):
    """Extract label/value pairs from the ``contact-info`` block of a page.

    Each ``<span class="f">`` inside ``<div class="contact-info">``
    contributes one entry: the value is the text of its nested
    ``<span class="val">`` (with "复制" removed, surrounding spaces stripped
    and anything from a "（YYYY年）" note onwards cut off); the label is the
    text left in the span after the value span, links and blank tags have
    been removed.

    The spans are modified in place while being processed.

    Args:
        com_soup: Parsed company page.

    Returns:
        dict mapping label text to value text.

    Raises:
        AttributeError: if the ``contact-info`` div or a ``val`` span is missing.
    """
    year_note = re.compile(r'\（\d{4}\s*年\）')
    contact_block = com_soup.find('div', class_='contact-info')

    data = {}
    for field in contact_block.find_all('span', class_='f'):
        raw_value = field.find('span', class_='val').text
        value = raw_value.replace('复制', '').strip(' ')
        found = year_note.search(value)
        if found:
            # Keep only the text in front of the "（YYYY年）" annotation.
            value = value.partition(found.group(0))[0]

        # Strip the value span, links and blank tags so only the label remains.
        deletep(field, 'span', 'class', 'val')
        deletep(field, 'a', '', '')
        deletek(field)

        label = field.text.replace('\n', '').replace('复制', '').strip(' ').replace('：', '')
        data[label] = value
    return data

def checklogin(key, timeout=30):
    """Run a qcc.com search for *key* and report whether the session is logged in.

    The key is sent through ``params`` so characters such as ``&``, ``#`` or
    ``+`` in a company name are percent-encoded instead of corrupting the
    query string. A timeout is applied so a stalled connection cannot block
    the worker forever; like any other request error, a timeout propagates
    to the caller.

    Reads the module global ``headers`` defined by the ``__main__`` block.

    Args:
        key: Search term (social credit code or company name).
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        The parsed search page, or ``''`` when the page title shows the
        login page (session not logged in).

    Raises:
        AttributeError: if the response contains no ``<title>`` element.
    """
    url = 'https://www.qcc.com/web/search'
    req = requests.get(url, params={'key': key}, headers=headers, timeout=timeout)
    soup = BeautifulSoup(req.content, 'html.parser')
    if soup.find('title').text == '会员登录 - 企查查':
        log.info('状态---未登录')
        return ''
    return soup

def redaytowork(com_name, social_code):
    """Process one company: search it, scrape it, and return its run count.

    Steps:
      1. Look the company up via ``baseCore`` (by code if given, else by
         name) and read its current run count from index 14 of the record.
      2. Search qcc.com. If the session is not logged in, requeue the
         company, sleep 20 s and return the unchanged count.
      3. If the search reports zero hits, record the company on the
         "需处理企业" sheet and return the unchanged count.
      4. Otherwise call ``spiderwork``; on success return ``count + 1``, on
         any error requeue the company and return the unchanged count.

    Both requeue paths push ``"<name>|<code>"`` to the same redis key. The
    key and the format are the ones used by the original login-failure path
    and by the (currently commented-out) ``redicPullData`` call in the
    ``__main__`` block, which splits entries on ``|``.

    Reads the module globals ``file_name`` and ``start_time``.
    NOTE(review): there is no handling for an empty lookup result; indexing
    it raises, as it did before.

    Args:
        com_name: Company name.
        social_code: Unified social credit code; may be empty.

    Returns:
        The run count, incremented by one only when scraping succeeded.
    """
    redis_key = 'BaseInfoEnterprise:gnqy_socialCode'
    queue_item = f'{com_name}|{social_code}'

    if social_code:
        dic_info = baseCore.getInfomation(social_code)
    else:
        dic_info = baseCore.getBYnameInfomation(com_name)
    log.info(f'----当前企业{social_code}-{com_name}--开始处理---')
    count = dic_info[14]

    # Search by credit code when available, otherwise by company name.
    soup = checklogin(social_code or com_name)
    if not soup:
        log.info("登录失效===重新放入redis")
        baseCore.rePutIntoR(redis_key, queue_item)
        # baseCore.delete_token(token)
        log.info('=====已重新放入redis,失效token已删除======')
        time.sleep(20)
        return count

    searchinfo = soup.find_all('div', class_='npanel-heading')[1].find('span', class_='text-danger').text
    if searchinfo == '0':
        log.info('=====搜索不到该企业====')
        # Companies that cannot be found are written to the report sheet.
        appenddata(file_name, '需处理企业', [com_name, social_code])
        return count

    # Start scraping.
    try:
        spiderwork(soup, com_name)
        count += 1
        log.info(f'采集{com_name}成功=======耗时{baseCore.getTimeCost(start_time, time.time())}')
        return count
    except Exception as e:
        log.info(f'====={social_code}=====获取基本信息失败，重新放入redis=====')
        # Record the cause so failures can be diagnosed from the log.
        log.info(f'====={social_code}=====error: {e!r}')
        baseCore.rePutIntoR(redis_key, queue_item)
        # baseCore.delete_token(token)
        log.info('=====已重新放入redis,失效token已删除======')
        return count


def spiderwork(soup, receptname, timeout=30):
    """Scrape the detail page of the search hit whose name equals *receptname*.

    The search-result table in *soup* is scanned for a row whose displayed
    name equals *receptname* exactly. If none is found, the company is
    appended to the "需处理企业" sheet. Otherwise the qcc id taken from the
    hit's URL is stored in ``EnterpriseInfo.QCCID``, the detail page is
    downloaded, its business-information table and contact block are parsed
    and merged, and a row is appended to the "获取基本信息成功企业" sheet.

    Reads the module globals ``social_code``, ``headers``, ``file_name``,
    ``cursor_``, ``cnx_`` and ``log``.

    Args:
        soup: Parsed search-result page.
        receptname: Company name that a search hit must match exactly.
        timeout: Seconds to wait for the detail page; passed to ``requests.get``.

    Raises:
        AttributeError, IndexError, KeyError: when the page lacks an element
            or field this parser expects. The caller treats any exception as
            a failed scrape.
    """
    company_url = ''
    company_list = soup.find('table', class_='app-ltable ntable ntable-list ntable ntable-list')
    tr_list = company_list.find_all('tr', class_='tsd0')

    for tr in tr_list:
        info_t = tr.find('span', class_='copy-title')
        getname = info_t.find('span').text
        log.info(f'接收到的企业名称--{receptname}---采到的企业名称--{getname}')
        if getname == receptname:
            company_url = info_t.find('a')['href']
            break

    if not company_url:
        # No search hit carries exactly the requested company name.
        appenddata(file_name, '需处理企业', [receptname, social_code])
        return

    qccid = company_url.split('firm/')[1].split('.html')[0]
    # Store the scraped qcc id. Values are bound as parameters because qccid
    # comes from scraped HTML and must not be spliced into the SQL text.
    # NOTE(review): assumes the driver behind cursor_ uses the "%s" DB-API
    # paramstyle (e.g. PyMySQL / mysql-connector) - confirm.
    cursor_.execute(
        "update EnterpriseInfo set QCCID = %s where SocialCode = %s",
        (qccid, social_code),
    )
    cnx_.commit()

    req_ = requests.get(company_url, headers=headers, timeout=timeout)
    com_soup = BeautifulSoup(req_.content, 'html.parser')
    businessinfo = com_soup.find('div', class_='cominfo-normal')
    if businessinfo is None:
        # NOTE(review): the result is unused; the call is kept so that a page
        # without a contact block still raises exactly as it did before.
        baseinfo(com_soup)
        return

    data_businfo = {}
    data_baseinfo = baseinfo(com_soup)
    try:
        name = businessinfo.find('div', class_='ntag text-gray original-tag').text
        value = businessinfo.find('div', class_='original-name-list').text.replace('展开', '').replace(' ', '').replace('…', '').replace('\n', '').replace('复制', '')
    except AttributeError:
        # One of the two elements is absent: record an empty former name.
        name = '曾用名'
        value = ''
    data_businfo[name] = value

    td_tags = businessinfo.find_all('td')
    for td in td_tags:

        if 'class' in td.attrs and 'tb' in td['class']:
            # Label cell: register its label(s) as keys with no value yet.
            # Labels are added in reverse so that the value cell below can
            # fill them by counting back from the end of the dict.
            div_tags = td.find_all('div')
            texts = [div.text for div in div_tags]
            if len(texts) > 0:
                for text in texts[::-1]:
                    data_businfo[text.replace('复制', '').replace('\n', '').strip(' ')] = None
            else:
                data_businfo[td.text.replace('复制', '').replace('\n', '').strip(' ')] = None
        else:
            # Cell without class 'tb': a value cell. Strip decorations first.
            att_list = ['inline-block', 'ntag-v2', 'm-l-r-10', 'm-l-sm']
            for att in att_list:
                deletep(td, 'a', 'class', att)
            deletek(td)
            deletep(td, 'div', 'class', 'text-gray clearfix original-name-part')
            deletespan(td)
            div_tags = td.find_all('div')
            texts = [div.text for div in div_tags if len(div.attrs) == 0]
            if len(texts) > 0:
                # Several values in one cell: the i-th value goes to the
                # i-th most recently added key.
                i = 1
                for text in texts:
                    if text == ' ':
                        continue
                    data_businfo[list(data_businfo.keys())[-i]] = text.replace('复制', '').replace('\n', '').replace(' ', '')
                    i += 1
            else:
                if '实缴资本' in td.text:
                    # Keep only the text in front of the "实缴资本" marker.
                    value = td.text.replace('复制', '').replace('\n', '').replace(' ', '').split('实缴资本')[0]
                    data_businfo[list(data_businfo.keys())[-1]] = value

                else:
                    data_businfo[list(data_businfo.keys())[-1]] = td.text.replace('复制', '').replace('\n', '').replace(' ', '')

    result_dict = getinfo(data_businfo, data_baseinfo)

    print(result_dict)
    # Record the successfully scraped company.
    data = [receptname, social_code, result_dict['统一社会信用代码']]
    appenddata(file_name, '获取基本信息成功企业', data)
    # sendkafka(result_dict)

# Script entry point. The names assigned here (taskType, file_name, headers,
# start_time, company_field, com_name, social_code) are module globals that
# the functions above read directly.
if __name__ == '__main__':
    taskType = '基本信息/企查查'

    # Fetch data from redis
    # Report file for this round, named after the first 8 characters of the
    # dash-stripped timestamp.
    nowtime = baseCore.getNowTime(1).replace('-', '')[:8]
    file_name = f'./data/企业基本信息采集情况_{nowtime}.xlsx'
    createFile(file_name)
    while True:
        # TODO: must be re-captured and updated roughly every two hours; the token should be read from the database
        # token = baseCore.GetToken()
        # if token:
        #     pass
        # else:
        #     log.info('==========已无token==========')
        #     time.sleep(30)
        #     continue
        # NOTE(review): the Cookie below is a hard-coded session captured from
        # a browser; it is rebuilt unchanged on every iteration.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'qcc_did=046d99c9-566e-4046-9094-689901b79748; UM_distinctid=18aac5b8c21810-046f8431aecf58-26031f51-1fa400-18aac5b8c22efd; CNZZDATA1254842228=109635008-1695108795-https%253A%252F%252Fwww.qcc.com%252F%7C1695113473; _uab_collina=169935323766710839405007; QCCSESSID=4e595fd804c28ae43780e55183; acw_tc=7522281e16999324472113552e97729806c88361a71c9bc96f8d5ff1c0',
            'Host': 'www.qcc.com',
            'Referer': 'https://www.qcc.com/',
            'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
        }
        start_time = time.time()
        # Get the next company to process
        # company_field = baseCore.redicPullData('BaseInfoEnterprise:gnqy_socialCode')
        # NOTE(review): the redis pull above is commented out and replaced by a
        # fixed "name|code" value, so the 'end' and empty branches below are
        # never taken and every iteration handles the same company - confirm
        # this is a debugging leftover.
        company_field = '小米通讯技术有限公司|91110108558521630L'
        if company_field == 'end':
            # This round is finished: send the report by e-mail and start the next round
            baseCore.sendEmail(file_name)
            time.sleep(20)

            # Create the file for the next round
            # NOTE(review): unlike the first file, this one is written to './'
            # rather than './data/' and uses a 10-character timestamp slice -
            # confirm whether the difference is intended.
            nowtime = baseCore.getNowTime(1).replace('-', '')[:10]
            file_name = f'./企业基本信息采集情况_{nowtime}.xlsx'
            createFile(file_name)
            continue

        if company_field == '' or company_field is None:
            # No new companies to collect after this round
            time.sleep(20)
            continue

        # Entries have the form "<company name>|<social credit code>".
        com_name = company_field.split('|')[0]
        social_code = company_field.split('|')[1]

        count = redaytowork(com_name,social_code)
        # After collection, update the company's run count
        runType = 'BaseInfoRunCount'
        baseCore.updateRun(social_code, runType, count)