import os
import time

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
# Project helper shared by the crawler below: it is used for the de-duplication
# lookup (`db_storage.find_one`), `paserUrl`, `sendKafka` and `save_data`.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project core helper: it is used below for attachment handling (`uptoOBS`,
# `tableUpdate`) and it also supplies the module-wide logger.
# NOTE: both helpers are instantiated at import time, so importing this module
# runs their constructors.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Beijing: policy documents published by the Beijing SASAC (北京市国资委).

# Substrings that mark an <a href> as a downloadable attachment.
# NOTE(review): 'xls' has no leading dot in the original list, so it matches any
# URL containing "xls"; kept as-is to preserve which links are uploaded.
_ATTACHMENT_MARKERS = (
    '.pdf', '.docx', '.doc', 'xls', '.zip', '.rar', '.ppt',
    '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
)


def _new_headless_chrome():
    """Create the headless Chrome driver used for the crawl.

    The target site has anti-scraping measures, so pages are fetched through a
    real browser driven by Selenium rather than plain HTTP requests.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
    # Machine-specific locations of the browser binary and its driver.
    chrome_options.binary_location = r'D:\Google\Chrome\Application\chrome.exe'
    chromedriver = r'D:\cmd100\chromedriver.exe'
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object; left unchanged to match the installed driver setup.
    return webdriver.Chrome(options=chrome_options, executable_path=chromedriver)


def _collect_listing_links(bro, url):
    """Page through the listing at *url* and return ``[href, title]`` pairs.

    Pagination stops when the last link of the pager is no longer titled
    '下一页' (next page), or when the pager contains no links at all.
    """
    links = []
    bro.get(url)
    time.sleep(2)
    bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(1)
    while True:
        ul = bro.find_element(By.CLASS_NAME, 'public_list_team')
        for li in ul.find_elements(By.TAG_NAME, 'li'):
            # Look the anchor up once; every lookup is a WebDriver round trip.
            anchor = li.find_element(By.TAG_NAME, 'a')
            links.append([anchor.get_attribute('href'), anchor.get_attribute('title')])
        pager_links = bro.find_element(By.CLASS_NAME, 'fanye').find_elements(By.TAG_NAME, 'a')
        # An empty pager means there is nothing to click; treat it as the last page
        # instead of raising IndexError and losing the links gathered so far.
        if not pager_links or pager_links[-1].get_attribute('title') != '下一页':
            break
        pager_links[-1].click()
        time.sleep(2)
    return links


def _field_after(text, label, terminator='\n'):
    """Return the stripped text following *label* in *text*, up to *terminator*.

    Raises:
        IndexError: if *label* does not occur in *text*.
    """
    return text.split(label)[1].split(terminator)[0].strip()


def _scrape_article(bro, link, title, num):
    """Load one detail page and build the record that is sent to Kafka.

    Args:
        bro: the Selenium driver.
        link: URL of the detail page.
        title: title taken from the listing page.
        num: running counter forwarded to ``baseCore.tableUpdate``.

    Returns:
        The record dict, or ``None`` when the article is skipped (empty body,
        or no ``div_div`` element inside the body).

    Raises:
        Exception: whatever Selenium or the parsing raises when the page does
            not have the expected layout; the caller decides how to handle it.
    """
    id_list = []
    bro.get(link)
    time.sleep(1)

    # Metadata panel: read its text once and slice the labelled fields out of it.
    pub = bro.find_element(By.CLASS_NAME, 'doc-info')
    info_text = str(pub.text)
    topic = _field_after(info_text, '[主题分类] ')
    organ = _field_after(info_text, '[发文机构] ')
    pub_time = _field_after(info_text, '[发布日期] ', '[有效性] ')
    writtenDate = _field_after(info_text, '[成文日期] ')
    pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').strip()
    if '号' not in pub_hao:
        pub_hao = ''

    # The source line is optional. Start from '' for every article so a page
    # without it neither raises NameError nor inherits the previous value.
    pub_source = ''
    try:
        for block in bro.find_elements(By.CLASS_NAME, 'article-info'):
            block_text = block.text
            if '来源：' in block_text:
                pub_source = block_text.split('来源：')[1].split('\n')[0]
    except Exception as e:
        # Best effort only: a failure here must not drop the article.
        log.info(f'---{link}--------{e}-------')
        pub_source = ''

    cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')
    soup = baseTool.paserUrl(BeautifulSoup(cont, 'lxml'), link)
    if soup.text == '' or soup.text == 'None':
        log.info(f'----{link}----{title}----内容为空----')
        return None

    # Remove the "scan the QR code" widget from the body.
    scan_widget = soup.find('div', id='div_div')
    if scan_widget is None:
        # NOTE(review): the original code skipped (silently) any article without
        # this widget; the skip is preserved but now logged. Confirm whether such
        # articles should be collected instead.
        log.info(f'----{link}----{title}----div_div not found, skipped----')
        return None
    scan_widget.decompose()

    # Upload attachments and point their links at the stored copies.
    for anchor in soup.find_all('a'):
        try:
            file_href = anchor['href']
        except KeyError as e:
            log.info(f'---{link}--------{e}-------')
            continue
        if not any(marker in file_href for marker in _ATTACHMENT_MARKERS):
            continue
        file_name = anchor.text.strip()
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1667', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '北京市国资委', file_name, num, pub_time)
        id_list.append(att_id)
        # Fixed: the scheme separator '//' was missing ('http:zzsn...').
        anchor['href'] = 'http://zzsn.luyuen.com/' + str(full_path)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields of the Kafka message.
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': str(soup.text),
        'contentWithTag': str(soup),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
        'origin': pub_source,
        'organ': organ,
        'topicClassification': topic,
        'issuedNumber': pub_hao,
        'publishDate': pub_time,
        'writtenDate': writtenDate,
        'sid': '1697458829758697473',
        'sourceAddress': link,
        'summary': '',
        'title': title
    }


def bei_jing():
    """Crawl the Beijing SASAC policy listing and publish every new article.

    Collects all article links from the paginated listing, skips URLs already
    present in ``baseTool.db_storage``, scrapes each remaining detail page,
    sends the record through ``baseTool.sendKafka`` and, when that succeeds,
    stores it with ``baseTool.save_data``.

    A failure on a single article is logged and the crawl continues. Any other
    failure is logged and ends the run. The browser is always shut down.
    """
    start_time = time.time()
    url = 'http://gzw.beijing.gov.cn/xxfb/zcfg/index.html'
    bro = _new_headless_chrome()
    try:
        hrefs = _collect_listing_links(bro, url)
        log.info(f'------{len(hrefs)}条数据-------------')
        num = 0
        count = 0
        for link, title in hrefs:
            # Skip articles that were already collected in an earlier run.
            if baseTool.db_storage.find_one({'网址': link}):
                num += 1
                log.info('已采集----------跳过')
                continue
            try:
                dic_news = _scrape_article(bro, link, title, num)
            except Exception as e:
                # One malformed page must not abort the run: it is never stored,
                # so it would otherwise block every later article on each rerun.
                log.info(f'---{link}--------{e}-------')
                continue
            if dic_news is None:
                continue
            if baseTool.sendKafka(dic_news):
                baseTool.save_data(dic_news)
                num += 1
                count += 1
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')
    except Exception as e:
        log.info(e)
    finally:
        # Always release the Chrome/chromedriver processes, even after an error.
        bro.quit()
# Run the Beijing crawler when this file is executed as a script.
if __name__ == "__main__":
    bei_jing()