import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from selenium.webdriver.common.by import By

# Project-local helper. The shared instance is used below for getDriver(),
# paserUrl(), db_storage, sendKafka() and save_data().
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper. The shared instance is used below for uptoOBS(),
# tableUpdate() and getLogger().
from BaseCore import BaseCore
baseCore = BaseCore()
# Module-wide logger shared by all crawlers in this file.
log = baseCore.getLogger()

# Gansu Province
def gan_su():
    """Crawl policy documents from the Gansu SASAC site (gzw.gansu.gov.cn).

    Three crawlers run in sequence, one per group of list columns:

    * ``gan_su1`` - ten list columns whose detail pages are read through the
      ``links_tit`` / ``links_tab`` / ``links_words`` elements.
    * ``gan_su2`` - column c115552; detail fields are read from the
      ``table.bd1`` layout with several fallback layouts.
    * ``gan_su3`` - column c115553; the list is paginated through the
      ``nextbtn`` element and detail fields are read from the
      ``div.contenttitle`` layout with several fallback layouts.

    For every detail URL that ``baseTool.db_storage`` does not already hold,
    attachments are uploaded with ``baseCore.uptoOBS``, the record is sent
    with ``baseTool.sendKafka`` and, when that call reports success, saved
    with ``baseTool.save_data``.
    """
    relation_id = '1696'
    relation_name = '甘肃省国资委'
    topic_sid = '1697458829758697473'
    # Page source Selenium reports when the browser returned an empty document.
    blank_page = '<html><head></head><body></body></html>'
    # Substrings that mark a link as an attachment. Kept exactly as the
    # original checks (including 'xlsx' without a leading dot).
    attachment_marks = ('.pdf', '.docx', '.doc', 'xlsx', '.zip', '.rar', '.ppt',
                        '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    bd1_table = 'table[class="bd1"]'
    links_table = 'div[class="links_tab"]>table'

    def parse_listing(html, url):
        """Return one ``{'href', 'publishDate'}`` dict per row of a list page."""
        soup = baseTool.paserUrl(html, url)
        doc = pq(str(soup))
        entries = []
        for li in doc('ul[class="UlTab"]>li'):
            row = pq(li)
            href = row('td[width="400"]>a').attr('href')
            if not href:
                # A row without a link cannot be crawled; skip it instead of
                # letting bro.get(None) fail later.
                continue
            entries.append({'href': href,
                            'publishDate': row('td[width="100"]').text()})
        return entries

    def restart_if_blank(bro, href, html):
        """Replace the driver and reload ``href`` when the page came back blank.

        Returns the (possibly new) driver together with the page source.
        """
        if html != blank_page:
            return bro, html
        bro.close()
        bro.quit()
        bro = baseTool.getDriver()
        bro.get(href)
        return bro, bro.page_source

    def cell(doc, table, row, col):
        """Return the text of cell (row, col) of ``table`` (1-based indices)."""
        return doc(f'{table}>tbody>tr:nth-child({row})>td:nth-child({col})').text()

    def title_item(doc, index):
        """Return the text of the ``index``-th item of the contenttitle list."""
        return doc(f'div[class="contenttitle"]>ul>li:nth-child({index})>font>span').text()

    def apply_fallback_layouts(doc, fields):
        """Try three alternative page layouts while ``fields['title']`` is empty.

        ``fields`` is updated in place and returned.
        """
        if not fields['title']:
            fields['topic'] = cell(doc, links_table, 2, 4)
            fields['organ'] = cell(doc, links_table, 2, 2)
            fields['title'] = cell(doc, bd1_table, 3, 2)
            fields['written_date'] = cell(doc, bd1_table, 4, 2)
            fields['issued_number'] = cell(doc, bd1_table, 5, 2)
            fields['body'] = doc('div[id="content"]')
        if not fields['title']:
            fields['title'] = doc('h1[id="ti"]').text()
            fields['written_date'] = doc('div[class="pages-date"]').text().split(' 来源：')[0]
            origin = doc('div[class="pages-date"]>span[class="font"]').text().replace("来源：", "")
            if len(origin) < 1:
                origin = doc('div[class="pages-date"]>span').text().replace("来源：", "")
            fields['origin'] = origin
            fields['body'] = doc('div[id="UCAP-CONTENT"]')
        if not fields['title']:
            fields['title'] = doc('div[class="links_tit"]').text()
            fields['written_date'] = cell(doc, links_table, 4, 2)
            fields['origin'] = cell(doc, links_table, 2, 2)
            fields['issued_number'] = cell(doc, links_table, 5, 2)
            fields['body'] = doc('div[id="content"]')
        return fields

    def upload_attachments(soup, href, order, publish_date):
        """Upload every attachment linked from ``soup`` and rewrite its href.

        Returns the list of attachment ids reported by ``baseCore.tableUpdate``.
        Links whose upload reports a falsy ``state`` are left untouched.
        """
        attachment_ids = []
        for link in soup.find_all('a'):
            file_href = link.get('href')
            if not file_href:
                continue
            if not any(mark in file_href for mark in attachment_marks):
                continue
            file_name = link.text.strip()
            category = os.path.splitext(file_href)[1]
            if category not in file_name:
                file_name = file_name + category
            log.info(f'{file_name}---{href}--')
            ret_data = baseCore.uptoOBS(file_href, relation_id, file_name)
            if not ret_data['state']:
                continue
            att_id, full_path = baseCore.tableUpdate(ret_data, relation_name, file_name, order, publish_date)
            attachment_ids.append(att_id)
            # Point the link in the stored HTML at the uploaded copy.
            # NOTE(review): the prefix lacks '//' after 'http:' - kept as-is
            # because stored data may depend on it; confirm and fix upstream.
            link['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
        return attachment_ids

    def deliver(fields, href, publish_date, order, *, full_cleanup):
        """Build the record for one detail page, send it and save it.

        ``fields`` carries title, organ, topic, origin, issued_number,
        written_date and body (HTML string or PyQuery selection).
        With ``full_cleanup`` the ``#div_div`` and ``.editor`` nodes are
        removed first and bodies shorter than two characters are dropped.
        Returns True only when the record was sent and saved.
        """
        soup = baseTool.paserUrl(str(fields['body']), href)
        if full_cleanup:
            # Remove each widget independently: a missing #div_div must not
            # prevent the .editor node from being removed.
            div_tag = soup.find(id="div_div")
            if div_tag is not None:
                div_tag.extract()
            editor = soup.find(class_='editor')
            if editor is not None:
                editor.extract()
        attachment_ids = upload_attachments(soup, href, order, publish_date)
        # Rendered after the attachment hrefs were rewritten.
        content_with_tag = str(soup.prettify())
        content = soup.text
        if not content:
            log.info(f'-----{href}----{fields["title"]}----内容为空-----')
            return False
        if full_cleanup and len(content) < 2:
            return False
        dic_news = {
            'attachmentIds': attachment_ids,
            'author': '',
            'content': str(content),
            'contentWithTag': str(content_with_tag),
            'createDate': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'deleteFlag': 0,
            'id': '',
            'labels': [{'relationId': relation_id, 'relationName': relation_name, 'labelMark': "policy"}],
            'origin': fields['origin'],
            'organ': fields['organ'],
            'topicClassification': fields['topic'],
            'issuedNumber': fields['issued_number'],
            'publishDate': publish_date,
            'writtenDate': fields['written_date'],
            'sid': topic_sid,
            'sourceAddress': href,
            'summary': '',
            'title': fields['title']
        }
        if baseTool.sendKafka(dic_news):
            baseTool.save_data(dic_news)
            return True
        return False

    def gan_su1():
        """Crawl the ten list columns that share the ``links_*`` detail layout."""
        num = 0    # stored-or-skipped counter, passed to tableUpdate as order
        count = 0  # records newly sent and saved in this run
        start_time = time.time()
        bro = baseTool.getDriver()
        column_ids = [*range(115543, 115552), 115554]
        urls = [f'http://gzw.gansu.gov.cn/gzw/c{cid}/xxgk_list.shtml' for cid in column_ids]

        def labelled(text, start, end):
            """Return the value between the labels ``start`` and ``end``."""
            return text.split(start)[1].split(end)[0].replace('：', '').strip()

        try:
            for url in urls:
                try:
                    html = ''
                    # Up to four attempts; a source under 200 chars is treated
                    # as a failed load.
                    for _ in range(4):
                        bro.get(url)
                        time.sleep(2)
                        html = bro.page_source
                        if len(html) >= 200:
                            break
                        time.sleep(5)
                    entries = parse_listing(html, url)
                except Exception as e:
                    log.error(f'gan_su1 list page failed: {url} -- {e}')
                    continue
                for entry in entries:
                    href = entry['href']
                    # Per-item guard: one broken detail page must not drop the
                    # remaining items of this list page.
                    try:
                        if baseTool.db_storage.find_one({'网址': href}):
                            num += 1
                            continue
                        for _ in range(4):
                            bro.get(href)
                            time.sleep(2)
                            bro, dhtml = restart_if_blank(bro, href, bro.page_source)
                            if len(dhtml) >= 200:
                                break
                            time.sleep(5)
                        links_tab = str(bro.find_element(By.CLASS_NAME, 'links_tab').text)
                        fields = {
                            'title': str(bro.find_element(By.CLASS_NAME, 'links_tit').text),
                            'organ': labelled(links_tab, '发布机构', '主题分类'),
                            'issued_number': labelled(links_tab, '文号', '浏览次数'),
                            'written_date': labelled(links_tab, '生成日期', '关键字'),
                            'topic': '',
                            'origin': '',
                            'body': str(bro.find_element(By.CLASS_NAME, 'links_words').get_attribute('innerHTML')),
                        }
                        if deliver(fields, href, entry['publishDate'], num, full_cleanup=False):
                            num += 1
                            count += 1
                    except Exception as e:
                        log.error(f'gan_su1 detail page failed: {href} -- {e}')
        finally:
            # Always release the browser, even on KeyboardInterrupt.
            bro.quit()
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def gan_su2():
        """Crawl column c115552, whose detail pages use the ``table.bd1`` layout."""
        num = 0
        count = 0
        start_time = time.time()
        bro = baseTool.getDriver()
        url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
        try:
            entries = []
            try:
                bro.get(url)
                time.sleep(3)
                bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                time.sleep(1)
                entries = parse_listing(bro.page_source, url)
            except Exception as e:
                log.error(f'gan_su2 list page failed: {url} -- {e}')
            for entry in entries:
                href = entry['href']
                try:
                    if baseTool.db_storage.find_one({'网址': href}):
                        num += 1
                        continue
                    bro.get(href)
                    try:
                        # Click the 'alls' element when its text contains
                        # '全文' (full text).
                        expand = bro.find_element(By.CLASS_NAME, 'alls')
                        if '全文' in expand.text:
                            expand.click()
                    except Exception:
                        # Best effort: failures here are deliberately ignored.
                        pass
                    time.sleep(3)
                    bro, html = restart_if_blank(bro, href, bro.page_source)
                    doc = pq(html)
                    fields = apply_fallback_layouts(doc, {
                        'topic': cell(doc, bd1_table, 1, 4),
                        'organ': cell(doc, bd1_table, 2, 2),
                        'written_date': cell(doc, bd1_table, 2, 4),
                        'title': cell(doc, bd1_table, 3, 2),
                        'issued_number': cell(doc, bd1_table, 4, 2),
                        'origin': '',
                        'body': doc('div[id="UCAP-CONTENT"]'),
                    })
                    # Count only records that were actually sent and saved.
                    if deliver(fields, href, entry['publishDate'], num, full_cleanup=True):
                        num += 1
                        count += 1
                except Exception as e:
                    log.error(f'gan_su2 detail page failed: {href} -- {e}')
        finally:
            bro.quit()
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def gan_su3():
        """Crawl the paginated column c115553 (``div.contenttitle`` layout)."""
        num = 0
        count = 0
        start_time = time.time()
        bro = baseTool.getDriver()
        url = 'http://gzw.gansu.gov.cn/gzw/c115553/xxgk_list.shtml'
        try:
            entries = []
            try:
                bro.get(url)
                time.sleep(2)
                bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                # NOTE(review): the loop ends only when clicking 'nextbtn'
                # raises; if the button stays clickable on the last page this
                # never terminates - confirm against the live site.
                while True:
                    time.sleep(1)
                    ul = bro.find_element(By.CLASS_NAME, 'UlTab')
                    for li in ul.find_elements(By.TAG_NAME, 'li'):
                        link = li.find_element(By.TAG_NAME, 'a').get_attribute('href')
                        try:
                            publish_date = li.find_elements(By.XPATH, './/td[@width="100"]')[0].text
                        except Exception:
                            publish_date = ''
                        entries.append({'href': link, 'publishDate': publish_date})
                    try:
                        bro.find_element(By.CLASS_NAME, 'nextbtn').click()
                    except Exception:
                        break
            except Exception as e:
                # Links collected before the failure are still processed below.
                log.error(f'gan_su3 list page failed: {url} -- {e}')
            for entry in entries:
                href = entry['href']
                try:
                    if baseTool.db_storage.find_one({'网址': href}):
                        num += 1
                        continue
                    bro.get(href)
                    time.sleep(3)
                    bro, html = restart_if_blank(bro, href, bro.page_source)
                    doc = pq(html)
                    fields = apply_fallback_layouts(doc, {
                        'topic': title_item(doc, 2),
                        'organ': title_item(doc, 3),
                        'written_date': title_item(doc, 4),
                        'title': title_item(doc, 5),
                        'issued_number': title_item(doc, 6),
                        'origin': '',
                        'body': doc('div[id="detailContent"]'),
                    })
                    # Prefer the date on the detail page, but keep the list
                    # page date when the detail layout does not provide one.
                    publish_date = title_item(doc, 7) or entry['publishDate']
                    if not fields['title'] or fields['body'].text() == '':
                        info = doc('div[class="main"]>div[class="clearbox"]>p:nth-child(1)').text()
                        fields['title'] = doc('div[class="main"]>h1').text().strip()
                        # NOTE(review): both picks below take the text BEFORE
                        # the label ([0]); [1] may have been intended - verify
                        # against a real page before changing.
                        fields['written_date'] = info.split('日期：')[0].split(' ')[0].strip()
                        fields['origin'] = info.split('来源：')[0].strip()
                        fields['body'] = doc('div[class="detailContent"]')
                    if deliver(fields, href, publish_date, num, full_cleanup=True):
                        num += 1
                        count += 1
                except Exception as e:
                    log.error(f'gan_su3 detail page failed: {href} -- {e}')
        finally:
            bro.quit()
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    gan_su1()
    gan_su2()
    gan_su3()

# Run all Gansu crawlers when this file is executed as a script.
if __name__ == "__main__":
    gan_su()