import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper object. The crawlers in this file use its `headers`,
# `db_storage`, `paserUrl`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper object. The crawlers in this file call its `uptoOBS`
# and `tableUpdate` methods for attachments and take their logger from it.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# 海南
def hai_nan():
    pathType = 'policy/hainan/'

    def hai_nan1():
        """Crawl the department-document (bmwj) listing of gzw.hainan.gov.cn.

        Walks listing pages 0-12. For every article that is not yet present in
        ``baseTool.db_storage`` the detail page is parsed with up to three
        layout-specific strategies, tried in order; a strategy that raises
        hands over to the next one. Attachment links are pushed through
        ``baseCore.uptoOBS`` / ``baseCore.tableUpdate`` and the assembled record
        is sent with ``baseTool.sendKafka`` and stored with
        ``baseTool.save_data``.

        Returns nothing; progress and failures are reported through ``log``.
        """
        # Substrings that mark a link as a downloadable attachment. '.docx' and
        # '.xlsx' need no entry of their own: '.doc' / '.xls' already match them.
        attachment_markers = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                              '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
        list_root = 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/'

        def upload_attachments(links, id_list, pub_time, order):
            """Upload every attachment link in *links* and rewrite its href in place.

            Ids returned by ``baseCore.tableUpdate`` are appended to *id_list*.
            Errors raised by the upload calls propagate to the caller.
            """
            for link in links:
                try:
                    file_name = link.text
                    link_href = link['href']
                except Exception:
                    # Anchor without an href: nothing to upload.
                    continue
                if not any(marker in link_href for marker in attachment_markers):
                    continue
                category = os.path.splitext(link_href)[1]
                if category not in file_name:
                    file_name = file_name + category
                # Upload to the file server.
                retData = baseCore.uptoOBS(link_href, '1677', file_name)
                if not retData['state']:
                    continue
                att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, order, pub_time)
                id_list.append(att_id)
                # Point the link at the file-server copy.
                # NOTE(review): the prefix has no '//' after 'http:' and looks
                # malformed; kept byte-for-byte because stored data may rely on it.
                link['href'] = 'http:zzsn.luyuen.com/' + str(full_path)

        num = 0
        count = 0
        start_time = time.time()
        for page in range(13):
            if page == 0:
                url = list_root
            else:
                # list_root already ends with '/', so no extra separator is added
                # (the previous code produced '.../bmwj//index_N.html').
                url = list_root + 'index_{}.html'.format(page)
            try:
                resp_text = requests.get(url=url, headers=baseTool.headers, verify=False).content
                doc_resp = pq(resp_text)
                doc_items = doc_resp('.list-right_title').items()
                for doc_item in doc_items:
                    id_list = []
                    pub_time = doc_item.next().text().replace('发布时间： ', '') + ' 00:00:00'
                    title = doc_item('a:nth-child(2)').text().strip()
                    href = doc_item('a:nth-child(2)').attr('href')
                    # Turn the relative link from the listing into an absolute URL.
                    if '../../' in href:
                        href = 'http://gzw.hainan.gov.cn/zwgk_23509/' + href.replace('../../', '')
                    elif './' in href:
                        href = href.replace('./', 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/')
                    # Skip articles already stored, with or without query string.
                    try:
                        is_href1 = baseTool.db_storage.find_one({'网址': href.split('?')[0]})
                        is_href2 = baseTool.db_storage.find_one({'网址': href})
                    except Exception:
                        is_href1 = False
                        is_href2 = baseTool.db_storage.find_one({'网址': href})
                    if is_href1 or is_href2:
                        num += 1
                        log.info('已采集=====跳过')
                        continue
                    try:
                        try:
                            # Layout 1: metadata header block + '.xxgk-syxl-m-l-nr2' body.
                            href_text = requests.get(url=href, headers=baseTool.headers, verify=False).content
                            doc_href = pq(href_text)
                            pub_result = doc_href('.xxgk-syxl-t1023.clear').remove('script').text()
                            pub_result = pub_result.replace(' ', '').replace('　　', '')
                            # organ
                            pub_source = pub_result.split('发文机关：')[1].split('成文日期：')[0].strip()
                            origin = ''
                            pub_hao = pub_result.split('文号：')[1].split('发布日期：')[0].strip()
                            topicClassification = pub_result.split('分类：')[1].split('发文机关：')[0].strip()
                            writtenDate = pub_result.split('成文日期：')[1].split('文号：')[0].strip()
                            # Drop the inline script of the bold heading paragraph
                            # before the body is extracted.
                            doc_href('p[style="font-size: 16px;font-weight: bold;line-height: 30px;"]>script').remove()
                            content1 = doc_href('.xxgk-syxl-m-l-nr2')
                            soup = BeautifulSoup(str(content1), 'html.parser')
                            # Convert relative paths to absolute ones.
                            contentWithTag = baseTool.paserUrl(soup, href)
                            # Remove the "scan the QR code" widget when present.
                            qr_block = contentWithTag.find('div', id='div_div')
                            if qr_block is not None:
                                qr_block.decompose()
                            content = contentWithTag.text
                            if not content:
                                log.info(f'-----{href}----{title}----内容为空-----')
                                continue
                            upload_attachments(contentWithTag.find_all('a'), id_list, pub_time, num)
                        except Exception:
                            try:
                                # Layout 2: metadata in a <tbody>, body in '.zx-xxxqy-nr'.
                                resp = requests.get(url=href, headers=baseTool.headers, verify=False)
                                resp.encoding = resp.apparent_encoding
                                resp_text = resp.text
                                source = BeautifulSoup(resp_text, 'html.parser')
                                source = baseTool.paserUrl(source, href)
                                tbody_text = str(source.find('tbody').text)
                                # organ
                                pub_source = tbody_text.split('发文机关：')[1].split('发文日期：')[0].strip().lstrip()
                                origin = ''
                                pub_hao = tbody_text.split('文　　号：')[1].split('主 题 词：')[0].strip().lstrip()
                                pub_time = tbody_text.split('发文日期：')[1].split('名　　称：')[0].strip().lstrip()
                                pub_time = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
                                writtenDate = None
                                topicClassification = tbody_text.split('分　　类：')[1].split('发文机关：')[0].strip().lstrip()
                                contentWithTag = source.find('div', attrs={'class': 'zx-xxxqy-nr'})
                                content = contentWithTag.text
                                if not content:
                                    log.info(f'-----{href}----{title}----内容为空-----')
                                    continue
                                fu_jian_list = source.find_all('a')
                                try:
                                    upload_attachments(fu_jian_list, id_list, pub_time, num)
                                except Exception:
                                    # In this layout a failed attachment upload
                                    # drops the whole article.
                                    continue
                            except Exception:
                                # Layout 3: 'con_div' meta line + 'TRS_UEDITOR' body.
                                # No attachment handling for this layout.
                                resp = requests.get(url=href, headers=baseTool.headers, verify=False)
                                resp.encoding = resp.apparent_encoding
                                resp_text = resp.text
                                source = BeautifulSoup(resp_text, 'html.parser')
                                # Remove the "scan the QR code" widget when present.
                                qr_block = source.find('div', id='div_div')
                                if qr_block is not None:
                                    qr_block.decompose()
                                # Convert relative paths to absolute ones.
                                source = baseTool.paserUrl(source, href)
                                meta_text = str(source.find('div', attrs={'class': 'con_div'}).text)
                                pub_time = meta_text.split('来源：')[0].lstrip().strip()
                                origin = meta_text.split('来源：')[1].split(' 【字体：')[0].strip().lstrip()
                                pub_source = ''
                                pub_hao = ''
                                writtenDate = None
                                topicClassification = ''
                                contentWithTag = source.find('div', attrs={'class': 'TRS_UEDITOR'})
                                content = contentWithTag.text
                                if not content:
                                    log.info(f'-----{href}----{title}----内容为空-----')
                                    continue
                        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                        # Fields of the record sent to Kafka.
                        dic_news = {
                            'attachmentIds': id_list,
                            'author': '',
                            'content': content,
                            'contentWithTag': str(contentWithTag),
                            'createDate': time_now,
                            'deleteFlag': 0,
                            'id': '',
                            'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
                            'origin': origin,
                            'organ': pub_source,
                            'topicClassification': topicClassification,
                            'issuedNumber': pub_hao,
                            'publishDate': pub_time,
                            'writtenDate': writtenDate,
                            'sid': '1697458829758697473',
                            'sourceAddress': href,
                            'summary': '',
                            'title': title
                        }

                        flag = baseTool.sendKafka(dic_news)
                        if flag:
                            baseTool.save_data(dic_news)
                            log.info(title)
                            count += 1
                            num = num + 1
                    except Exception as e:
                        # Best effort: one broken article must not stop the crawl,
                        # but the failure is no longer silent.
                        log.error(f'hai_nan1: failed to process {href} ({title}): {e!r}')
            except Exception as e:
                # Best effort: a broken listing page is skipped, not fatal.
                log.error(f'hai_nan1: failed to process listing page {url}: {e!r}')
        end_time = time.time()
        log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

    def hai_nan2():
        def hai_nan_sw(page_href):
            """Crawl one listing page of www.hainan.gov.cn and store its new articles.

            Args:
                page_href: URL of the listing page to fetch.

            Returns:
                Number of listing entries that were either already stored or
                newly sent and saved during this call.

            Raises:
                Whatever ``requests.get`` / the listing parse raises for the
                listing page itself; failures on individual articles are logged
                and skipped.
            """
            num = 0
            req = requests.get(url=page_href, headers=baseTool.headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.text, 'html.parser')
            finally:
                # The text is fully parsed; release the connection on every path.
                req.close()
            for doc_item in doc_resp.find_all(class_='list-right_title fon_1'):
                item_html = str(doc_item)
                title = item_html.split('target="_blank">')[1].split('</a>')[0]
                href = 'https://www.hainan.gov.cn' + item_html.split('href="')[1].split('" target')[0]
                # Skip articles already stored, with or without query string.
                try:
                    is_href1 = baseTool.db_storage.find_one({'网址': href.split('?')[0]})
                    is_href2 = baseTool.db_storage.find_one({'网址': href})
                except Exception:
                    is_href1 = False
                    is_href2 = baseTool.db_storage.find_one({'网址': href})
                if is_href1 or is_href2:
                    num += 1
                    log.info('已采集=====跳过')
                    continue
                try:
                    href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        # Previously the response was closed only on the success
                        # path and leaked on `continue` or on any exception.
                        href_text.close()
                    # Convert relative paths to absolute ones.
                    doc_href = baseTool.paserUrl(doc_href, href)
                    # Remove the "scan the QR code" widget when present.
                    qr_block = doc_href.find('div', id='div_div')
                    if qr_block is not None:
                        qr_block.decompose()
                    pub_result = doc_href.find(class_='zwgk_comr1').find_all('li')
                    topicClassification = str(pub_result[0]).split('主题分类：</strong>')[1].split('</span>')[0]
                    # organ
                    pub_source = str(pub_result[1]).split('</strong>')[1].split('</span>')[0]
                    writtenDate = str(pub_result[1]).split('成文日期：</strong>')[1].split('</span>')[0]
                    pub_hao = str(pub_result[3]).split('文  号：</strong>')[1].split('</span>')[0].strip()
                    pub_time = str(pub_result[3]).split('发布日期：</strong>')[1].split('</span>')[0].strip()
                    contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                    content = contentWithTag.text
                    if not content:
                        log.info(f'-----{href}----{title}----内容为空-----')
                        continue
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # Fields of the record sent to Kafka.
                    dic_news = {
                        'attachmentIds': [],
                        'author': '',
                        'content': content,
                        'contentWithTag': str(contentWithTag),
                        'createDate': time_now,
                        'deleteFlag': 0,
                        'id': '',
                        'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
                        'origin': '',
                        'organ': pub_source,
                        'topicClassification': topicClassification,
                        'issuedNumber': pub_hao,
                        'publishDate': pub_time,
                        'writtenDate': writtenDate,
                        'sid': '1697458829758697473',
                        'sourceAddress': href,
                        'summary': '',
                        'title': title
                    }
                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
                        log.info(title)
                        num += 1
                except Exception as e:
                    # Best effort: one broken article must not stop the crawl,
                    # but the failure is no longer silent.
                    log.error(f'hai_nan_sw: failed to process {href} ({title}): {e!r}')
            return num

        def hai_nan_szf(page_href):
            """Crawl one listing page of www.hainan.gov.cn, including attachments.

            Each new article is parsed with the 'zwgk_comr1' metadata layout and,
            when that raises, with the 'line mar-t2 con_div' fallback layout.
            Attachment links inside the body are pushed through
            ``baseCore.uptoOBS`` / ``baseCore.tableUpdate`` before the record is
            sent with ``baseTool.sendKafka`` and stored with ``baseTool.save_data``.

            Args:
                page_href: URL of the listing page to fetch.

            Returns:
                Number of listing entries that were either already stored or
                newly sent and saved during this call.

            Raises:
                Whatever ``requests.get`` / the listing parse raises for the
                listing page itself; failures on individual articles are logged
                and skipped.
            """
            # Substrings that mark a link as a downloadable attachment. '.docx' and
            # '.xlsx' need no entry of their own: '.doc' / '.xls' already match them.
            attachment_markers = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                                  '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
            num = 0
            req = requests.get(url=page_href, headers=baseTool.headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.text, 'html.parser')
            finally:
                # The text is fully parsed; release the connection on every path.
                req.close()
            for doc_item in doc_resp.find_all(class_='list-right_title fon_1'):
                id_list = []
                item_html = str(doc_item)
                title = item_html.split('target="_blank">')[1].split('</a>')[0].replace('\n', '')
                href = 'https://www.hainan.gov.cn' + item_html.split('href="')[1].split('" target')[0]
                # Skip articles already stored, with or without query string.
                try:
                    is_href1 = baseTool.db_storage.find_one({'网址': href.split('?')[0]})
                    is_href2 = baseTool.db_storage.find_one({'网址': href})
                except Exception:
                    is_href1 = False
                    is_href2 = baseTool.db_storage.find_one({'网址': href})
                if is_href1 or is_href2:
                    num += 1
                    log.info('已采集=====跳过')
                    continue
                try:
                    href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        # Previously the response was closed only on the success
                        # path and leaked on `continue` or on any exception.
                        href_text.close()
                    # Convert relative paths to absolute ones.
                    doc_href = baseTool.paserUrl(doc_href, href)
                    # Remove the "scan the QR code" widget when present.
                    qr_block = doc_href.find('div', id='div_div')
                    if qr_block is not None:
                        qr_block.decompose()
                    try:
                        # Primary layout: metadata list items under 'zwgk_comr1'.
                        pub_result = doc_href.find(class_='zwgk_comr1').find_all('li')
                        topicClassification = str(pub_result[0]).split('主题分类：</strong>')[1].split('</span>')[0]
                        # organ
                        origin = ''
                        pub_source = str(pub_result[1]).split('</strong>')[1].split('</span>')[0]
                        pub_hao = str(pub_result[3]).split('文  号：</strong>')[1].split('</span>')[0].strip()
                        writtenDate = str(pub_result[1]).split('成文日期：</strong>')[1].split('</span>')[0]
                        pub_time = str(pub_result[3]).split('发布日期：</strong>')[1].split('</span>')[0].strip()
                        contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                        content = contentWithTag.text
                        if not content:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                    except Exception:
                        # Fallback layout: single meta line "<time> 来源：<origin> 【字体：".
                        pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
                        meta_text = str(pub_result.text)
                        topicClassification = ''
                        origin = meta_text.split('来源：')[1].split(' 【字体：')[0].lstrip().strip()
                        pub_source = ''
                        pub_time = meta_text.split('来源：')[0].lstrip().strip()
                        pub_hao = ''
                        # Was `None,` - the trailing comma made this the tuple
                        # (None,), which then went out in the record.
                        writtenDate = None
                        contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                        content = contentWithTag.text
                        if not content:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                    for fu_jian in contentWithTag.find_all('a'):
                        try:
                            fu_jian_href = fu_jian['href']
                        except Exception:
                            # Anchor without an href: nothing to upload.
                            continue
                        file_name = fu_jian.text
                        if not any(marker in fu_jian_href for marker in attachment_markers):
                            continue
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # Upload to the file server.
                        retData = baseCore.uptoOBS(fu_jian_href, '1677', file_name)
                        if not retData['state']:
                            continue
                        att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num, pub_time)
                        id_list.append(att_id)
                        # Point the link at the file-server copy.
                        # NOTE(review): the prefix has no '//' after 'http:' and looks
                        # malformed; kept byte-for-byte because stored data may rely on it.
                        fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # Fields of the record sent to Kafka.
                    dic_news = {
                        'attachmentIds': id_list,
                        'author': '',
                        'content': content,
                        'contentWithTag': str(contentWithTag),
                        'createDate': time_now,
                        'deleteFlag': 0,
                        'id': '',
                        'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
                        'origin': origin,
                        'organ': pub_source,
                        'topicClassification': topicClassification,
                        'issuedNumber': pub_hao,
                        'publishDate': pub_time,
                        'writtenDate': writtenDate,
                        'sid': '1697458829758697473',
                        'sourceAddress': href,
                        'summary': '',
                        'title': title
                    }

                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
                        log.info(title)
                        num += 1
                except Exception as e:
                    # Best effort: one broken article must not stop the crawl,
                    # but the failure is no longer silent.
                    log.error(f'hai_nan_szf: failed to process {href} ({title}): {e!r}')
            return num

        def hai_nan_szfbgt(page_href):
            """Crawl one listing page of www.hainan.gov.cn, including attachments.

            Each new article is parsed with the 'zwgk_comr1' metadata layout and,
            when that raises, with the 'line mar-t2 con_div' fallback layout.
            Attachment links inside the body are pushed through
            ``baseCore.uptoOBS`` / ``baseCore.tableUpdate`` before the record is
            sent with ``baseTool.sendKafka`` and stored with ``baseTool.save_data``.

            Args:
                page_href: URL of the listing page to fetch.

            Returns:
                Number of listing entries that were either already stored or
                newly sent and saved during this call.

            Raises:
                Whatever ``requests.get`` / the listing parse raises for the
                listing page itself; failures on individual articles are logged
                and skipped.
            """
            # Substrings that mark a link as a downloadable attachment. '.docx' and
            # '.xlsx' need no entry of their own: '.doc' / '.xls' already match them.
            attachment_markers = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                                  '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
            num = 0
            req = requests.get(url=page_href, headers=baseTool.headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.text, 'html.parser')
            finally:
                # The text is fully parsed; release the connection on every path.
                req.close()
            for doc_item in doc_resp.find_all(class_='list-right_title fon_1'):
                id_list = []
                item_html = str(doc_item)
                title = item_html.split('target="_blank">')[1].split('</a>')[0]
                href = 'https://www.hainan.gov.cn' + item_html.split('href="')[1].split('" target')[0]
                # Skip articles already stored, with or without query string.
                try:
                    is_href1 = baseTool.db_storage.find_one({'网址': href.split('?')[0]})
                    is_href2 = baseTool.db_storage.find_one({'网址': href})
                except Exception:
                    is_href1 = False
                    is_href2 = baseTool.db_storage.find_one({'网址': href})
                if is_href1 or is_href2:
                    num += 1
                    log.info('已采集=====跳过')
                    continue
                try:
                    href_text = requests.get(url=href, headers=baseTool.headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        # Previously the response was closed only on the success
                        # path and leaked on `continue` or on any exception.
                        href_text.close()
                    # Convert relative paths to absolute ones.
                    doc_href = baseTool.paserUrl(doc_href, href)
                    try:
                        # Primary layout: metadata list items under 'zwgk_comr1'.
                        pub_result = doc_href.find(class_='zwgk_comr1').find_all('li')
                        topicClassification = str(pub_result[0]).split('主题分类：</strong>')[1].split('</span>')[0]
                        # organ
                        origin = ''
                        pub_source = str(pub_result[1]).split('</strong>')[1].split('</span>')[0]
                        writtenDate = str(pub_result[1]).split('成文日期：</strong>')[1].split('</span>')[0]
                        # The document-number label occurs with two different
                        # amounts of padding between its characters.
                        try:
                            pub_hao = str(pub_result[3]).split('文  号：</strong>')[1].split('</span>')[0].strip()
                        except Exception:
                            pub_hao = str(pub_result[3]).split('文       号：</strong>')[1].split('</span>')[0].strip()

                        pub_time = str(pub_result[3]).split('发布日期：</strong>')[1].split('</span>')[0].strip()
                        contentWithTag = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                        content = contentWithTag.text
                        # Same empty-body guard the fallback layout applies;
                        # it was missing from this branch.
                        if not content:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                    except Exception:
                        # Fallback layout: single meta line "<time> 来源：<origin> 【字体：".
                        pub_result = doc_href.find('div', attrs={'class': 'line mar-t2 con_div'})
                        meta_text = str(pub_result.text)
                        topicClassification = ''
                        origin = meta_text.split('来源：')[1].split(' 【字体：')[0].lstrip().strip()
                        pub_time = meta_text.split('来源：')[0].lstrip().strip()
                        pub_hao = ''
                        pub_source = ''
                        # Was `None,` - the trailing comma made this the tuple
                        # (None,), which then went out in the record.
                        writtenDate = None
                        contentWithTag = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                        content = contentWithTag.text
                        if not content:
                            log.info(f'-----{href}----{title}----内容为空-----')
                            continue
                    for fu_jian in contentWithTag.find_all('a'):
                        try:
                            fu_jian_href = fu_jian['href']
                        except Exception:
                            # Anchor without an href: nothing to upload.
                            continue
                        file_name = fu_jian.text
                        if not any(marker in fu_jian_href for marker in attachment_markers):
                            continue
                        category = os.path.splitext(fu_jian_href)[1]
                        if category not in file_name:
                            file_name = file_name + category
                        # Upload to the file server.
                        retData = baseCore.uptoOBS(fu_jian_href, '1677', file_name)
                        if not retData['state']:
                            continue
                        att_id, full_path = baseCore.tableUpdate(retData, '海南省国资委', file_name, num, pub_time)
                        id_list.append(att_id)
                        # Point the link at the file-server copy.
                        # NOTE(review): the prefix has no '//' after 'http:' and looks
                        # malformed; kept byte-for-byte because stored data may rely on it.
                        fu_jian['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # Fields of the record sent to Kafka.
                    dic_news = {
                        'attachmentIds': id_list,
                        'author': '',
                        'content': content,
                        'contentWithTag': str(contentWithTag),
                        'createDate': time_now,
                        'deleteFlag': 0,
                        'id': '',
                        'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
                        'origin': origin,
                        'organ': pub_source,
                        'topicClassification': topicClassification,
                        'issuedNumber': pub_hao,
                        'publishDate': pub_time,
                        'writtenDate': writtenDate,
                        'sid': '1697458829758697473',
                        'sourceAddress': href,
                        'summary': '',
                        'title': title
                    }

                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
                        log.info(title)
                        num += 1
                except Exception as e:
                    # Best effort: one broken article must not stop the crawl,
                    # but the failure is no longer silent.
                    log.error(f'hai_nan_szfbgt: failed to process {href} ({title}): {e!r}')
            return num

        def hai_nan_zy(page_href):
            """Crawl one listing page of central-government documents.

            Each ``<li>`` inside the ``list list_1 list_2`` container is treated as
            one document link. Links already found in ``baseTool.db_storage`` are
            skipped; every other document is downloaded, parsed, passed to
            ``baseTool.sendKafka`` and, when that call returns a truthy value,
            stored with ``baseTool.save_data``.

            A failure on a single document is logged and does not stop the page.

            :param page_href: URL of the listing page.
            :return: number of entries counted for this page: entries skipped as
                already collected or excluded, plus entries sent successfully.
            """
            # Detail pages that are deliberately never collected.
            skipped_hrefs = (
                'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm',
                'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm',
            )
            num = 0
            req = requests.get(url=page_href, headers=baseTool.headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.content, 'html.parser')
            finally:
                # Release the connection even when reading or parsing fails.
                req.close()
            list_div = doc_resp.find('div', attrs={'class': 'list list_1 list_2'})
            if list_div is None:
                # Nothing to iterate over on this page; report it instead of
                # failing with an AttributeError.
                log.info(f'-----{page_href}----未找到文件列表-----')
                return num
            for doc_item in list_div.find_all('li'):
                link = doc_item.find('a')
                i_href = link.get('href') if link is not None else None
                if not i_href:
                    # An entry without a usable link cannot be fetched; skip it
                    # so the remaining entries of the page are still processed.
                    continue
                title = str(link.text)
                # e.g. https://www.gov.cn/zhengce/202307/content_6893055.htm
                if 'https://www.gov.cn/zhengce/' not in i_href:
                    i_href = str(i_href).replace('../../', 'https://www.gov.cn/zhengce/')
                try:
                    # De-duplicate against stored records, with and without
                    # the query string.
                    try:
                        is_href1 = baseTool.db_storage.find_one({'网址': i_href.split('?')[0]})
                        is_href2 = baseTool.db_storage.find_one({'网址': i_href})
                    except Exception:
                        is_href1 = False
                        is_href2 = baseTool.db_storage.find_one({'网址': i_href})
                    if is_href1 or is_href2:
                        num += 1
                        log.info('已采集=====跳过')
                        continue
                    if i_href in skipped_hrefs:
                        num += 1
                        continue
                    href_text = requests.get(url=i_href, headers=baseTool.headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        # The body is fully read at this point, so the response
                        # can be released on every path (including `continue`).
                        href_text.close()
                    # Convert relative paths to absolute paths.
                    doc_href = baseTool.paserUrl(doc_href, i_href)
                    # Remove the "scan the QR code" block when present.
                    qr_div = doc_href.find('div', id='div_div')
                    if qr_div is not None:
                        qr_div.decompose()
                    try:
                        # Layout 1: metadata in the cells of the `bd1` table.
                        pub_result = doc_href.find('table', class_='bd1').find_all('td')
                        pub_time = (
                            str(pub_result[13])
                            .replace('年', '-').replace('月', '-').replace('日', '')
                            .split('<td>')[1].split('</td>')[0]
                        )
                        pub_source = str(pub_result[5]).split('<td>')[1].split('</td>')[0]
                        pub_hao = str(pub_result[11]).split('<td>')[1].split('</td>')[0]
                        contentWithTag = doc_href.find(class_='b12c')
                        content = contentWithTag.text
                    except Exception:
                        # Layout 2: any failure above means the page does not use
                        # the table layout; fall back to the `pages-date` header.
                        try:
                            pub_result = doc_href.find(class_='pages-date')
                            pub_source = pub_result.find('span', class_='font').text.replace('来源：', '').strip()
                            pub_time = (
                                str(pub_result)
                                .split('<span')[0]
                                .split('"pages-date">')[1]
                                .split('来源')[0]
                                .strip()
                            )
                        except Exception:
                            pub_source = ''
                            pub_time = None
                        pub_hao = ''
                        contentWithTag = doc_href.find(class_='pages_content')
                        content = contentWithTag.text
                        if not content:
                            log.info(f'-----{i_href}----{title}----内容为空-----')
                            continue
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # todo: fields sent to kafka
                    dic_news = {
                        'attachmentIds': [],
                        'author': '',
                        'content': content,
                        'contentWithTag': str(contentWithTag),
                        'createDate': time_now,
                        'deleteFlag': 0,
                        'id': '',
                        'labels': [{'relationId': "1677", 'relationName': "海南省国资委", 'labelMark': "policy"}],
                        'origin': '',
                        'organ': pub_source,
                        'topicClassification': '',
                        'issuedNumber': pub_hao,
                        'publishDate': pub_time,
                        'writtenDate': None,
                        'sid': '1697458829758697473',
                        'sourceAddress': i_href,
                        'summary': '',
                        'title': title
                    }

                    flag = baseTool.sendKafka(dic_news)
                    if flag:
                        baseTool.save_data(dic_news)
                        log.info(title)
                        num += 1
                except Exception as e:
                    # Best effort per document: record the failure and move on
                    # instead of discarding it silently.
                    log.info(f'-----{i_href}----{title}----采集失败：{e}-----')
            return num

        def start():
            """Crawl every document category linked from the Hainan policy index page.

            The index page is fetched once and the link of each ``.nzcti`` element
            is collected. The first three collected links are crawled with
            ``hai_nan_zy``, ``hai_nan_sw`` and ``hai_nan_szf`` respectively; any
            further link is crawled with ``hai_nan_szfbgt``. Each category has a
            fixed number of listing pages.

            A failure on one listing page is logged and does not stop the crawl.
            The total returned by the page crawlers and the elapsed time are
            logged at the end.
            """
            count = 0
            start_time = time.time()
            url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"

            def list3_pages(first_page, category_url, max_page):
                """Return ``first_page`` followed by the list3_2 .. list3_<max_page> URLs."""
                prefix = str(category_url).split('list3')[0]
                return [first_page] + [
                    prefix + 'list3_{}.shtml'.format(page + 1) for page in range(1, max_page)
                ]

            try:
                req = requests.get(url=url, headers=baseTool.headers, verify=False)
                try:
                    req.encoding = req.apparent_encoding
                    doc_resp = pq(req.text)
                finally:
                    # Release the connection even when decoding/parsing fails.
                    req.close()
                leibie_href_list = []
                for doc_item in doc_resp('.nzcti').items():
                    leibie = doc_item('a').text()
                    leibie_href = doc_item('a').attr('href')
                    if '更多' in leibie:
                        # "More" links are site-relative; prefix the host.
                        leibie_href = 'https://www.hainan.gov.cn' + leibie_href
                    leibie_href_list.append(leibie_href)
                # Documents of each category
                for category_url in leibie_href_list:
                    # Pagination: pick the page crawler and the page URLs.
                    if category_url == leibie_href_list[0]:
                        crawl_page, delay = hai_nan_zy, 1
                        base = str(category_url)
                        # The first page is `home.htm`; later pages are
                        # `home_<n>.htm`.
                        page_hrefs = [base + 'home.htm'] + [
                            base + f'home_{page}.htm' for page in range(1, 23)
                        ]
                    elif category_url == leibie_href_list[1]:
                        # e.g. https://www.hainan.gov.cn/hainan/swygwj/list3_2.shtml
                        crawl_page, delay = hai_nan_sw, 0
                        page_hrefs = list3_pages(
                            'https://www.hainan.gov.cn/hainan/swygwj/list3.shtml', category_url, 8)
                    elif category_url == leibie_href_list[2]:
                        crawl_page, delay = hai_nan_szf, 0
                        page_hrefs = list3_pages(
                            'https://www.hainan.gov.cn/hainan/szfwj/list3.shtml', category_url, 84)
                    else:
                        crawl_page, delay = hai_nan_szfbgt, 0
                        page_hrefs = list3_pages(
                            'https://www.hainan.gov.cn/hainan/szfbgtwj/list3.shtml', category_url, 84)
                    for page_href in page_hrefs:
                        try:
                            count += crawl_page(page_href)
                        except Exception as e:
                            # Best effort per page: record the failure, keep going.
                            log.info(f'-----{page_href}----采集失败：{e}-----')
                        if delay:
                            time.sleep(delay)
            except Exception as e:
                log.info(f'-----{url}----采集失败：{e}-----')
            end_time = time.time()
            log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

        start()

    hai_nan1()
    hai_nan2()

# Run the Hainan crawl when this file is executed as a script.
if __name__ == "__main__":
    hai_nan()