# _*_ coding:utf-8 _*_

"""数据全量跑一遍，不做判重逻辑"""
import json
import re
import time

import fitz
import pymongo
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
from pyquery import PyQuery as pq
from requests.packages import urllib3

from BaseCore import BaseCore
baseCore = BaseCore()

urllib3.disable_warnings()
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from lxml import etree
from random import choice

# Shared logger obtained from the project's BaseCore helper.
log = baseCore.getLogger()
# Task category label; the literal means "policies and regulations".
taskType = '政策法规'

# The bare string below is a no-op expression statement (it is not the module
# docstring). It appears to list the sources this script covers: State Council
# documents, State Council department documents, State Council policy releases
# and the local SASAC sites.
"""
国务院文件
国务院部门文件
国务院-政策发布
各地方国资委
"""

# MongoDB collection used for de-duplication lookups (by '网址') and for storing
# scraped records.
# NOTE(review): host and credentials are hard-coded in source; consider moving
# them to configuration or environment variables.
db_storage = pymongo.MongoClient('mongodb://114.115.221.202:27017', username='admin', password='zzsn@9988').caiji['国务院_国资委']

# Default HTTP headers (desktop Chrome User-Agent) used by the crawlers below
# unless a function defines its own.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

def replaceUrl(hostUrl, src):
    """Append a relative link to ``hostUrl`` after dropping its leading dots.

    Args:
        hostUrl: Base URL that the cleaned ``src`` is concatenated onto.
        src: Link as found in the page, e.g. ``'../a/b.pdf'`` or ``'./b.pdf'``.

    Returns:
        ``hostUrl + src`` with the leading ``../`` or ``.`` prefix removed.
        A leading ``./`` keeps its slash, so ``'./b.pdf'`` becomes ``'/b.pdf'``
        before concatenation.
    """
    if '../' in src:
        # lstrip, not strip: only the leading "../" run must go. strip() also
        # ate trailing '/' and '.' characters (e.g. "../docs/" -> "docs").
        src = src.lstrip('../')
    if './' in src:
        src = src.lstrip('.')
    return hostUrl + src

def _to_full_record(result_dict):
    """Return ``result_dict`` in the full field layout sent to the push service.

    Records that already carry '信息来源' are returned unchanged (same object).
    Otherwise the short local-SASAC layout ('号', '内容', ...) is mapped onto
    the full layout, with 'tid' defaulting to '1666'.
    """
    if '信息来源' in result_dict:
        return result_dict
    return {
        '标题': result_dict['标题'],
        '来源': result_dict['来源'],
        '发文机关': '',
        '发文字号': result_dict['号'],
        '内容-未去标签': result_dict['内容'],
        '附件网址': result_dict['附件网址'],
        '发布时间': result_dict['发布时间'],
        '成文时间': '',
        '主题分类': '',
        '网址': result_dict['网址'],
        '归属': result_dict['归属'],
        '信息来源': '地方国资委',
        'tid': result_dict.get('tid', '1666'),
    }


def save_data(result_dict):
    """Push one scraped record to the extraction service and store it in MongoDB.

    Records with an empty title are ignored. If the service answers HTTP 500,
    the push is retried once with the content replaced by '--'. When the push
    does not end with HTTP 200 (or raises), the record is stored with an empty
    'is_send' marker after a 10 second pause; otherwise it is stored as sent.

    Args:
        result_dict: Record in either the full layout (has '信息来源') or the
            short local layout (uses '号' and '内容').

    Returns:
        None.
    """
    a_dict = _to_full_record(result_dict)
    if not a_dict['标题']:
        return

    post_url = 'http://39.105.62.235:1820/ExtarctLawInfo'
    headers_ = {
        'Content-Type': 'application/json'
    }
    pushed = False
    try:
        resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
        if resp.status_code == 500:
            # Retry without the body. Derive the retry payload from the
            # normalized record so full-layout records (which have no
            # '号'/'内容' keys) no longer fail with KeyError here.
            a_dict = dict(a_dict)
            a_dict['内容-未去标签'] = '--'
            resp = requests.post(post_url, headers=headers_, verify=False, data=json.dumps(a_dict))
        print('推送：', resp.status_code)
        pushed = resp.status_code == 200
    except Exception as e:
        # Best effort: a failed push must not lose the record, it is stored
        # below with the 'is_send' marker instead.
        print('推送异常：', e)

    if not pushed:
        print('推送失败！')
        time.sleep(10)
        a_dict['is_send'] = ''
    db_storage.insert_one(a_dict)

def sendKafka(dic_news):
    """Send one news record to the "policy" Kafka topic and log the outcome.

    The record is serialized as UTF-8 JSON (non-ASCII kept as-is). The send is
    awaited for up to 10 seconds. Failures are logged, not raised.

    Args:
        dic_news: JSON-serializable dict describing the record.

    Returns:
        None.
    """
    start_time = time.time()
    producer = None
    try:  # 114.116.116.241
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
        kafka_result = producer.send("policy",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        log.info(f'Kafka send took {baseCore.getTimeCost(start_time, time.time())}')
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.error(dic_result)
        log.error(f'Kafka send failed after {baseCore.getTimeCost(start_time, time.time())}')
    finally:
        # A producer is created per call, so it must be closed here or its
        # connections and background thread are leaked on every send.
        if producer is not None:
            producer.close()

def redefid(idList):
    """Join the given ids into a single comma-separated string."""
    return ','.join(str(item) for item in idList)

def get_content1():
    """Crawl State Council documents from the gov.cn search API.

    For each document category the first result page (20 items) is requested.
    Every item not yet present in ``db_storage`` (matched on '网址') has its
    detail page fetched and parsed into a record. Saving is currently disabled
    (see the commented ``save_data`` call), so the function only prints titles
    and a final count with the elapsed time.
    """
    start_time = time.time()
    num = 0
    # athenaAppKey / athenaAppName are required to pass the site's verification.
    athenaAppKeys = [
        'ZfaiEpAY%2B%2FYj5RjJrDfj2dn%2BcS4WRoxcLidI68z5l6SH8WE8CXSVP7QkJNIhy%2Bng4mZcwCuOFKNUvj%2FH6mR7sx0sCwgkIAfq4XNfHY6Fy7fxQ0NWm%2Fx7rmB5ow5OPdW5NMdI2RzURAFCVA9aIV4a1W8TFOjbQWYOukxUFtVJibU%3D',
        'XU4ULRMBYHbE0I2fNNgCBxYTAg5Dk%2FPUEN9XeKy4OOAPdcZ6DW%2BrsopeI1gUwPmq3Y%2FtZJhH3NXiXqH5RxBAoYO231FHjPaMxD6QaMlA2BUeOFxmGKnuvJnIby1k6RmrCFd6IoXSImm5RFgcVed%2FvL6Qie2o6BAkRaEUHAitK18%3D',
        'IFrNB5NkaDApRpF09SoT4fVBUoi7gRF2prj4EHk8eIVSEc1yYPpAZWVDMnqc2lVmeaQcNvrvZffA2kPVdvqjUHV5lGPJccRK3epnJ5Xx3xwIfTG7iIgrjFlqK1I93E0SIP6wyZJu42ksnF3nJdZ31sLEDCBeLi3pkggFtIEIQsg%3D',
        'Nkgtgnyd%2B6jfdlclssI8FB9xRTQDdWzreONdqvta2aKZMRlhWoHhdj6L%2BQRyD8InaLWJC1zCSOkIy5b%2BjjZTg80t2jPu%2F1ifcRnboIj8%2BDIYWNSxMu%2Fdxze7oPtPo6sR08%2B3tQOE3ZntyFsGT44vCpa6DgK8ee3C5S58lanYXuI%3D',
        'Zcko%2F7%2F2f2EuUmKpXbWnK3JtZtVy4trUNyE2JA5jVIw2r1oxTXVNZy8KQDmnOPDfyazdOrH6VYaJWloE4MukMK4VloB%2BRy6QhEaUvm%2Fsp4Enzl7doEk%2B1sZ1Y2iUd5REIhJQ%2Bp%2BB5iJEeNTmlQuRzYU3kOjDYtXftuehRTNKiXk%3D',
        'Y2guFVvdtqMPhx5s9xThqdkvbe5hPaTlV7BYhcDuK7l%2BaXUqUMUHdim3uzn9IRlbHUtOLmRk6tfPEFM%2B8vzGDvI8U48acQ8Ff6MsfOGxShrQ7kW4tr4NaoE1sBW3PNkWj1Z0K6JzSXmAS2C1zVchTUYzTlfk62ghIeDtIPsPa6s%3D',
        'a1drgLsStJotfBqHp1cFQg4lTJMMbgkTjVgCv34uy4Q%2BQ86DNEdc%2Fst0dZTUWFttuyXKNIH8%2FPYSSk465lXIn4wfuG4GuZLUk6wQo5PHNCUP0%2FvIL63IUxT0DCMo7lbsPq0ncdh4aiVswJe%2F6LM9U1m9OoaNGbeIUOl%2FxIOrMnE%3D',
        'SGwQuPLZq2UzfaBPSwcR8DGZa4Ckh3Amp%2Bc1tMBFsMp%2Fh7Qn%2B9nspxdI3CW9S5LlfxYQmfa%2F%2B%2BJdH%2BBnxt0ILiCA4o9TUOxx27MhN9b4CLZnD8ZJ6sOwMszdFToDAD7hE21%2FzCzxhPNzPbyMXPpeMdi6sY0O2Sd85PLDtlZv%2FYQ%3D',
        'CbvDEoIrP1%2BgMOuRJFhJNUGhzHBGnwdI6lIVG1ns1ZaLTlGRXLRgMjh9nBwLGMLTZwlPskklMbygvfA4P5UGhGT%2FpqKFkZne%2FAzTK8U6oJMo5%2FNAczbHhKwG7gdepIiiI7CgeNDtP8kurkcxnVS2KA1CLo8CVzmMlLHRmMPI8ag%3D']
    athenaAppKey = choice(athenaAppKeys)
    # Headers for the search API (local to this function; the module-level
    # ``headers`` is not used here).
    search_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
        'athenaAppKey': f"{athenaAppKey}",
        'athenaAppName': "%E5%9B%BD%E7%BD%91%E6%90%9C%E7%B4%A2",
        'Content-Type': 'application/json;charset=UTF-8',
    }
    # Headers for the detail pages.
    detail_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    }

    requests.adapters.DEFAULT_RETRIES = 5

    url = 'https://sousuoht.www.gov.cn/athena/forward/486B5ABFBAD0FF5743F5E82E007EF04DDD6388E7989E9EC9CC7B84917AC81A5F'
    # [category label, category id used in "childrenInfoIds"]
    result_list = [['国令', "1108"], ['国发', "1107"], ['国函', "1106"], ['国发明电', "1105"], ['国办发', "1104"],
                   ['国办函', "1103"],
                   ['国办发明电', "1102"], ['其他', "1101"]]
    # A link counts as an attachment when it contains one of these substrings.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    for pcodeJiguan, category_id in result_list:
        # Fetch the listing per category, so one failing category no longer
        # aborts the remaining ones.
        try:
            s = requests.session()
            s.keep_alive = False
            try:
                pageNo = 1
                # Parameters required by the POST request.
                data = {"code": "18122f54c5c", "thirdPartyCode": "thirdparty_code_107", "thirdPartyTableId": 30,
                        "resultFields": ["pub_url", "maintitle", "fwzh", "cwrq", "publish_time"],
                        "trackTotalHits": "true",
                        "searchFields": [{"fieldName": "maintitle", "searchWord": ""}], "isPreciseSearch": 0,
                        "sorts": [{"sortField": "publish_time", "sortOrder": "DESC"}],
                        "childrenInfoIds": [[category_id]],
                        "pageSize": 20, "pageNo": pageNo}
                res = s.post(url=url, headers=search_headers, data=json.dumps(data), verify=False)
                # The response body is JSON.
                page_list = json.loads(res.text)['result']['data']['list']
            finally:
                s.close()
        except Exception as e:
            log.error(f'get_content1: listing failed for {pcodeJiguan}: {e}')
            continue

        for page in page_list:
            try:
                title = page['maintitle']
                pub_time1 = page['publish_time']
                pub_time2 = page['cwrq']
                pub_code = page['fwzh']
                href = page['pub_url']
                # Skip pages that were already crawled.
                if db_storage.find_one({'网址': href}):
                    continue

                resp_href = requests.get(url=href, headers=detail_headers, verify=False)
                try:
                    resp_href.encoding = resp_href.apparent_encoding
                    i_html = resp_href.text
                finally:
                    resp_href.close()
                if '您访问的页面不存在或已删除' in i_html:
                    continue

                i_soup = BeautifulSoup(i_html, 'html.parser')
                source = str(i_soup.find_all('tbody')[0])
                pub_org = source.split('<td><b>发文机关：</b></td>')[1].split('<td>')[1].split('</td>')[0]
                child_type = source.split('<td class="w340 zcwj_ztfl">')[1].split('</td>')[0]
                content = str(i_soup.find('table', attrs={'class': 'pages_content'}))
                fu_jian_href_list = [
                    link for link in re.findall('href="(.*?)"', content)
                    if any(ext in link for ext in attachment_exts)
                ]
                result_dict = {
                    '标题': title,
                    '来源': '',
                    '发文机关': pub_org,
                    '发文字号': pub_code,
                    '内容-未去标签': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time1,
                    '成文时间': pub_time2,
                    '主题分类': child_type,
                    '网址': href,
                    '归属': pcodeJiguan,
                    '信息来源': '国务院文件',
                    'tid': 1766,
                }
                print(title)
                # NOTE(review): saving is disabled in the original code, so
                # result_dict is built but not persisted.
                # save_data(result_dict)
                # time.sleep(1)
                num += 1
            except Exception as e:
                # Best effort: one malformed item must not stop the crawl.
                log.error(f'get_content1: failed to process item: {e}')
    end_time = time.time()
    # end - start: the original subtracted the other way round and printed a
    # negative duration.
    print(f'共抓取{num}条数据，共耗时{end_time - start_time}')


# 国务院部门文件
def _format_epoch_ms(value):
    """Format an epoch-milliseconds value as 'YYYY-MM-DD HH:MM:SS' local time.

    Returns '' when the value is missing or cannot be converted.
    """
    try:
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(value / 1000)))
    except Exception:
        return ''


def get_content2():
    """Crawl State Council department documents from the gov.cn search API.

    For every department name the first result page (p=0, 20 items) is
    requested. Each item not yet present in ``db_storage`` (matched on '网址')
    has its detail page fetched, and the resulting record is passed to
    ``save_data``. Prints each saved title and a final count with elapsed time.
    """
    from urllib.parse import urljoin

    start_time = time.time()
    num = 0
    result_list = ['外交部', '国家发展和改革委员会', '教育部', '科学技术部', '工业和信息化部', '国家民族事务委员会', '公安部', '国家安全部', '民政部', '司法部', '财政部',
                   '人力资源和社会保障部', '自然资源部', '生态环境部', '住房和城乡建设部', '交通运输部', '水利部', '农业农村部', '商务部', '文化和旅游部',
                   '国家卫生健康委员会',
                   '退役军人事务部',
                   '应急管理部', '人民银行', '审计署', '国务院国有资产监督管理委员会', '海关总署', '国家税务总局', '国家市场监督管理总局', '国家金融监督管理总局',
                   '国家广播电视总局',
                   '国家体育总局',
                   '国家统计局', '国家国际发展合作署', '国家医疗保障局', '国家机关事务管理局', '国家标准化管理委员会', '国家新闻出版署', '国家版权局', '国家互联网信息办公室',
                   '中国科学院',
                   '中国社会科学院', '中国工程院', '中国气象局', '中国银行保险监督管理委员会', '中国证券监督管理委员会', '国家粮食和物资储备局', '国家能源局', '国家国防科技工业局',
                   '国家烟草专卖局',
                   '国家移民管理局', '国家林业和草原局', '国家铁路局', '中国民用航空局', '国家邮政局', '国家文物局', '国家中医药管理局', '国家矿山安全监察局', '国家外汇管理局',
                   '国家药品监督管理局',
                   '国家知识产权局', '国家档案局', '国家保密局', '国家密码管理局', '国家宗教事务局', '国务院台湾事务办公室', '国家乡村振兴局', '国家电影局']
    # A link counts as an attachment when it contains one of these substrings.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    for bmfl in result_list:
        try:
            pageNo = 0
            time.sleep(2)
            # Build the search URL for this department.
            url_ = f'https://sousuo.www.gov.cn/search-gov/data?t=zhengcelibrary_bm&q=&timetype=timeqb&mintime=&maxtime=&sort=score&sortType=1&searchfield=title&puborg=&pcodeYear=&pcodeNum=&filetype=&p={pageNo}&n=20&inpro=&bmfl={bmfl}&dup=&orpro=&type=gwyzcwjk'
            try:
                # The response body is JSON.
                resp = requests.get(url=url_, headers=headers, verify=False)
                try:
                    content_list = json.loads(resp.text)['searchVO']['listVO']
                finally:
                    resp.close()
            except Exception as e:
                log.error(f'get_content2: listing failed for {bmfl}: {e}')
                continue

            for content_dict in content_list:
                href = content_dict['url']  # detail page
                title = content_dict['title']  # title
                pub_code = content_dict['pcode']  # document number
                pub_time1 = _format_epoch_ms(content_dict.get('pubtime'))  # publish time
                pub_time2 = _format_epoch_ms(content_dict.get('ptime'))  # written time
                pub_org = content_dict['puborg']  # issuing organ
                child_type = content_dict.get('childtype', '')  # topic classification
                # Skip pages that were already crawled.
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    resp = requests.get(url=href, headers=headers, verify=False)
                    try:
                        resp.encoding = 'utf-8'
                        soup = BeautifulSoup(resp.text, 'html.parser')
                    finally:
                        resp.close()
                    time.sleep(1)
                    content = str(soup.find('div', attrs={'class': 'pages_content mhide'}))
                    # urljoin resolves relative links against the page URL and
                    # leaves absolute links intact. The previous string
                    # concatenation mangled absolute links.
                    fu_jian_href_list = [
                        urljoin(href, link)
                        for link in re.findall('href="(.*?)"', content)
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': '',
                        '发文机关': pub_org,
                        '发文字号': pub_code,
                        '内容-未去标签': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time1,
                        '成文时间': pub_time2,
                        '主题分类': child_type,
                        '网址': href,
                        '归属': bmfl,
                        '信息来源': '国务院部门文件',
                        'tid': 1699,
                    }
                    print(title)
                    save_data(result_dict)
                    num += 1
                except Exception as e:
                    # Best effort: one failing page must not stop the crawl.
                    log.error(f'get_content2: failed to process {href}: {e}')
        except Exception as e:
            log.error(f'get_content2: failed for {bmfl}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据，耗时{end_time - start_time}')


# 国务院国有资产监督管理委员会-政策发布
def get_content3():
    """Crawl the SASAC policy-release list page and save each new article.

    Reads the list at sasac.gov.cn, skips entries already present in
    ``db_storage`` (matched on '网址'), then fetches each article and extracts
    content, source and document number before passing the record to
    ``save_data``. Articles with empty text are skipped. Prints each saved
    title and a final count with elapsed time.
    """
    start_time = time.time()
    num = 0
    url = "http://www.sasac.gov.cn/n2588035/n2588320/n2588335/index.html"
    try:
        # GET request; SSL verification is disabled.
        href_resp = requests.request("GET", url, headers=headers, verify=False)
        resp_text = href_resp.content.decode('UTF-8')
        doc_resp = pq(resp_text)
        doc_items = doc_resp('.zsy_conlist li').items()
        time.sleep(1)
        for doc_item in doc_items:
            # Collect the list-level fields.
            try:
                href_ = doc_item('a').attr('href')
                if href_ is None:
                    continue
                href = f'http://www.sasac.gov.cn{href_.replace("../../..", "")}'
                # Skip pages that were already crawled.
                if db_storage.find_one({'网址': href}):
                    continue
                title = doc_item('a').attr('title')
                pub_time = doc_item('span').text().replace('[', '').replace(']', '')
            except Exception as e:
                log.error(f'get_content3: failed to read list item: {e}')
                continue
            try:
                try:
                    resp_href = requests.request("GET", href, headers=headers, verify=False)
                    doc_href = pq(resp_href.content)
                    time.sleep(1)
                    content_html = str(doc_href('.zsy_comain').remove('style').remove('#qr_container'))
                    content = pq(content_html).text()
                except Exception as e:
                    log.error(f'get_content3: failed to fetch {href}: {e}')
                    continue
                if content.strip() == '':
                    continue
                try:
                    org_content = doc_href('.zsy_cotitle').text()
                    org = re.findall('文章来源：(.*?)发布时间：', org_content)[0].strip()
                except Exception:
                    # The source line is optional; fall back to an empty value.
                    org = ''
                try:
                    resp_href.encoding = 'utf-8'
                    resp_text_ = BeautifulSoup(resp_href.text, 'html.parser')
                    zsy_comain = resp_text_.find('div', attrs={'class': 'zsy_comain'})
                    pub_hao = ''
                    for p in zsy_comain.findAll('p'):
                        p = str(p.text)
                        # The document number is the first paragraph that has
                        # '号' together with one of the bracket pairs.
                        has_brackets = (('〔' in p and '〕' in p)
                                        or ('[' in p and ']' in p)
                                        or ('【' in p and '】' in p))
                        if '号' in p and has_brackets:
                            try:
                                pub_hao = p.split('日')[1].split('自')[0].strip()
                            except IndexError:
                                pub_hao = p.strip()
                            break
                except Exception:
                    pub_hao = ''
                # Overly long matches are treated as not being a document number.
                if len(pub_hao) > 45:
                    pub_hao = ''
                result_dict = {
                    '标题': title,
                    '来源': org,
                    '发文机关': '',
                    '发文字号': pub_hao,
                    '内容-未去标签': content_html,
                    '附件网址': [],
                    '发布时间': pub_time,
                    '成文时间': '',
                    '主题分类': '',
                    '网址': href,
                    '归属': '国务院国资委',
                    '信息来源': '国务院国资委',
                    'tid': 1642,
                }
                save_data(result_dict)
                print(title)
                num += 1
            except Exception as e:
                # Best effort: one failing article must not stop the crawl.
                log.error(f'get_content3: failed to process {href}: {e}')
    except Exception as e:
        log.error(f'get_content3: failed to load list page: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据，耗时{end_time - start_time}')

from bs4 import BeautifulSoup
from urllib.parse import urljoin
# 将html中的相对地址转换成绝对地址
def paserUrl(html, listurl):
    """Rewrite link targets of <a> and <img> tags to absolute URLs, in place.

    Args:
        html: Parsed document exposing ``find_all`` (a BeautifulSoup object at
            the visible call site).
        listurl: URL the relative addresses are resolved against.

    Returns:
        The same ``html`` object after modification.
    """
    for tag in html.find_all(['a', 'img']):
        # Only one attribute is rewritten per tag; 'href' wins over 'src'.
        if 'href' in tag.attrs:
            target = 'href'
        elif 'src' in tag.attrs:
            target = 'src'
        else:
            continue
        tag[target] = urljoin(listurl, tag[target])
    return html


# 北京
def bei_jing():
    """Crawl policy documents from the Beijing SASAC site with headless Chrome.

    Collects article links from every list page, then for a test slice of them
    extracts the metadata and body, uploads attachments through ``baseCore``
    and builds the Kafka payload. The payload is only printed; the
    ``sendKafka`` call is currently disabled.
    """
    num = 0  # number of documents processed
    start_time = time.time()
    # The site has anti-crawling measures, so selenium is used.
    # service = Service(r'D:/chrome/113/chromedriver.exe')
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option(
        "excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')

    chrome_options.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
    chrome_options.binary_location = r'D:/fbs_spider/Google/Chrome/Application/chrome.exe'
    chromedriver = r'D:/fbs_spider/cmd100/chromedriver.exe'
    bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=chromedriver)
    # A link counts as an attachment when it contains one of these substrings.
    attachment_exts = ('.pdf', '.docx', '.doc', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    try:
        with open('../../base/stealth.min.js') as f:
            js = f.read()

        bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": js
        })
        url = 'http://gzw.beijing.gov.cn/xxfb/zcfg/index.html'
        hrefs = []
        try:
            bro.get(url)
            time.sleep(2)
            bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)
            while True:
                # Collect the URL and title of every article on this list page.
                ul = bro.find_element(By.CLASS_NAME, 'public_list_team')
                for li in ul.find_elements(By.TAG_NAME, 'li'):
                    anchor = li.find_element(By.TAG_NAME, 'a')
                    hrefs.append([anchor.get_attribute('href'), anchor.get_attribute('title')])
                updown = bro.find_element(By.CLASS_NAME, 'fanye').find_elements(By.TAG_NAME, 'a')[-1]
                if updown.get_attribute('title') != '下一页':
                    break
                updown.click()
                time.sleep(2)
            # NOTE(review): the [4:6] slice and the disabled de-duplication are
            # test settings kept from the original code.
            for page_url, title in hrefs[4:6]:
                # is_href = db_storage.find_one({'网址': page_url})
                # if is_href:
                #     continue
                bro.get(page_url)
                time.sleep(1)
                # Extract the metadata block.
                pub = bro.find_element(By.CLASS_NAME, 'doc-info')
                pub_text = str(pub.text)
                pub_time = pub_text.split('[发布日期] ')[1].split('[有效性] ')[0].strip()
                pub_source = pub_text.split('[发文机构] ')[1].split('[联合发文单位] ')[0].split('[实施日期] ')[0].strip()
                pub_hao = pub.find_element(By.CLASS_NAME, 'fwzh').text.replace('[发文字号] ', '').strip()
                if '号' not in pub_hao:
                    pub_hao = ''
                cont = bro.find_element(By.ID, 'div_zhengwen').get_attribute('innerHTML')

                soup_cont = BeautifulSoup(cont, 'lxml')
                # Pass the page URL itself: the original passed the whole
                # [url, title] pair, which makes urljoin raise TypeError.
                soup = paserUrl(soup_cont, page_url)
                text = str(soup.prettify())
                print(text)

                # Attachment ids belong to this document only. The original
                # shared one list across documents, so later documents
                # inherited earlier attachment ids.
                id_list = []
                # ``order`` is the 1-based position among all anchors, which is
                # the value the original passed to tableUpdate.
                for order, file in enumerate(soup.find_all('a'), start=1):
                    file_href = file.get('href')
                    if not file_href:
                        continue
                    if not any(ext in file_href for ext in attachment_exts):
                        continue
                    file_name = file.text.strip()
                    retData = baseCore.uploadToserver(file_href, '1667')
                    if not retData['state']:
                        continue
                    att_id = baseCore.tableUpdate(retData, '北京市国资委', file_name, order)
                    id_list.append(att_id)

                id_ = redefid(id_list)
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # Fields of the Kafka payload.
                dic_news = {
                    'attachmentIds': id_,
                    'author': '',
                    'content': str(soup_cont.text),
                    'contentWithTag': str(soup_cont),
                    'createDate': time_now,
                    'deleteFlag': 0,
                    'id': '',
                    'labels': [{'relationId': "1667", 'relationName': "北京市国资委", 'labelMark': "policy"}],
                    'origin': pub_source,
                    'organ': pub_hao,
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': pub_time,
                    'sid': '0987654321',
                    'sourceAddress': '',
                    'summary': '',
                    'title': title
                }
                print(dic_news)
                # sendKafka(dic_news)
                num += 1
        except Exception as e:
            print(e)
    finally:
        # Always shut the browser down. The original only quit on the success
        # path and leaked a Chrome process on every error.
        bro.quit()
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

# 内蒙古
def nei_meng_gu():
    """Crawl policy documents from the Inner Mongolia SASAC site and send them to Kafka.

    Reads the list page, then for each entry fetches the article, extracts
    title, publish time, source, document number, body and attachment links,
    and sends the payload through ``sendKafka``. Prints the generated ids and
    a final count with elapsed time.

    NOTE(review): the loop is limited to the first list entry, de-duplication
    is disabled and 'attachmentIds' is a fixed placeholder. These are test
    settings kept from the original code.
    """
    from urllib.parse import urljoin

    id_list = []
    start = time.time()
    num = 0
    url = 'http://gzw.nmg.gov.cn/zfxxgk/zcfg/index.html'
    # A link counts as an attachment when it contains one of these substrings.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    try:
        resp_text = requests.get(url=url, headers=headers, verify=False)
        resp_text.encoding = 'utf-8'
        soup = BeautifulSoup(resp_text.text, 'html.parser')
        result = soup.find(class_='right_two')
        li_list = result.find_all(class_='font14wr')
        for a in li_list[:1]:
            a_text = str(a)
            real_href = 'https://gzw.nmg.gov.cn/zfxxgk' + a_text.split('href="..')[-1].split('" target="_blank')[0]
            # is_href = db_storage.find_one({'网址': real_href})
            # if is_href:
            #     continue
            try:
                # Extract the article fields.
                title = a_text.split('target="_blank">')[-1].split('</a>')[0]
                href_text = requests.get(url=real_href, headers=headers, verify=False)
                href_text.encoding = 'utf-8'
                i_soup = BeautifulSoup(href_text.text, 'html.parser')
                spans = i_soup.find('div', id='d_laiyuan').find_all('span')
                pub_time = str(spans[0]).split('<span>')[1].split('</span>')[0].replace('发布时间：', '')
                pub_source = str(spans[1]).split('<span>')[1].split('</span>')[0].replace('来源：', '')
                fwzh_html = str(i_soup.find_all('td')[7])
                # Keep the cell text only when it contains a 〔...〕 part and '内'.
                if re.findall('〔(.*?)〕', fwzh_html) and '内' in fwzh_html:
                    pub_hao = fwzh_html.split('<td>')[1].split('</td>')[0]
                else:
                    pub_hao = ''
                # Check for None before converting: str(None) is the truthy
                # string 'None', so the original fallback never ran.
                content_node = i_soup.find(class_='d_show')
                if content_node is None:
                    content_node = i_soup.find(class_='view TRS_UEDITOR trs_paper_default')
                content = str(content_node)

                fujian = i_soup.find_all(class_='ql_detailbro_right_qztp')
                # urljoin resolves './file' against the article URL and also
                # handles links without the './' prefix, which raised
                # IndexError in the original.
                fu_jian_href_list = [
                    urljoin(real_href, link)
                    for link in re.findall('href="(.*?)"', str(fujian))
                    if any(ext in link for ext in attachment_exts)
                ]
                # TODO: attachments need to be uploaded to the file server, type_id: 7

                # Record for save_data; unused while that call is disabled.
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': real_href,
                    '归属': '内蒙古自治区国资委',
                }
                print(title)
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                news_id = baseCore.getNextSeq()
                # Fields of the Kafka payload.
                dic_news = {
                    'attachmentIds': "14,15,16",
                    'author': '',
                    'content': content,
                    'contentWithTag': content,
                    'createDate': time_now,
                    'deleteFlag': 0,
                    'id': news_id,
                    'labels': [{'relationId': "1669", 'relationName': "内蒙古自治区国资委", 'labelMark': "policy"}],
                    'origin': pub_source,
                    'organ': pub_hao,
                    'topicClassification': '',
                    'issuedNumber': pub_hao,
                    'publishDate': pub_time,
                    'writtenDate': pub_time,
                    'sid': '0987654321',
                    'sourceAddress': '',
                    'summary': '',
                    'title': title
                }
                sendKafka(dic_news)
                print(news_id)
                id_list.append(news_id)
                # save_data(result_dict)
                num = num + 1
                break
            except Exception as e:
                # Best effort: one failing article must not stop the crawl.
                log.error(f'nei_meng_gu: failed to process {real_href}: {e}')
    except Exception as e:
        log.error(f'nei_meng_gu: failed to load list page: {e}')
    print(id_list)
    end = time.time()
    print('共', num, '条', '...........', '共耗时', end - start, '秒')


# 吉林
def ji_lin():
    """Crawl the Jilin SASAC policy list page and hand every new article to save_data.

    The list page is fetched once; each ``<li>`` entry is resolved to an
    article URL, skipped when that URL is already present in ``db_storage``,
    otherwise downloaded and parsed for source, publish time, body HTML and
    attachment links. Articles whose body could not be located are not saved.
    Failures are logged and the crawl continues (best effort). Returns None;
    a summary line with the item count and elapsed time is printed at the end.
    """
    start = time.time()
    num = 0
    url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
    try:
        resp_text = requests.get(url=url, headers=headers, verify=False)
        resp_text.encoding = 'utf-8'
        soup = BeautifulSoup(resp_text.text, 'html.parser')
        result = soup.find(class_='list ej_list')
        for a in result.find_all('li'):
            a_text = str(a)
            href = a.find('a')['href']  # article link
            if re.findall('http', href):
                real_href = href
            else:
                # Relative link: take the text after 'href=".' from the raw markup.
                real_href = url + a_text.split('href=".')[-1].split('" target="_blank')[0]
            title = a.find('a').text.replace('\n', '')
            if db_storage.find_one({'网址': real_href}):
                continue
            try:
                href_text = requests.get(url=real_href, headers=headers, verify=False)
                # Decode as UTF-8 directly. The previous ISO-8859-1 round trip
                # raised whenever requests had already decoded the page as UTF-8.
                href_text.encoding = 'utf-8'
                i_soup = BeautifulSoup(href_text.text, 'html.parser')
                try:
                    # Layout 1: <span class="source"> / <span class="time">.
                    i_come = i_soup.find('span', class_='source')
                    i_time = i_soup.find('span', class_='time')
                    pub_come = i_come.text.split('.write(" ')[1].split('");')[0].strip()
                    pub_time = i_time.text.split('时间：')[1].strip()
                except Exception:
                    cotitle = i_soup.find('div', class_='zsy_cotitle')
                    if cotitle is not None:
                        # Layout 2: a single <p> holding both source and time.
                        pub_come = cotitle.find('p').text.split('文章来源：')[1].split('发布时间：')[0].strip()
                        pub_time = cotitle.find('p').text.split('发布时间：')[1].strip()
                    else:
                        # Layout 3: "share" bar with left/right halves.
                        pub = i_soup.find(class_='share')
                        pub_time = pub.find(class_='left').find('span', class_='time').text
                        pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源：')[1].strip()
                        print(pub_come)
                fu_jian_href_list = []
                i_content = i_soup.find(class_='zsy_comain')
                if i_content is not None:
                    content = str(i_content)
                    fj = i_soup.find('div', style='width:920px; margin: 0 auto;')
                    if fj is not None:
                        # Distinct name so the outer list being iterated is not shadowed.
                        for fj_li in fj.find_all('li'):
                            fu_jian_href = fj_li.find('a')['href']
                            if 'http' not in fu_jian_href:
                                # NOTE(review): relative attachments are resolved against
                                # www.sasac.gov.cn, not the Jilin host - confirm this is intended.
                                fu_jian_href = 'http://www.sasac.gov.cn' + fu_jian_href.replace('../', '')
                            fu_jian_href_list.append(fu_jian_href)
                else:
                    content = str(i_soup.find(class_="content"))
                # str.replace returns a new string; the result must be assigned back.
                content = content.replace('扫一扫在手机打开当前页', '')
                result_dict = {
                    '标题': title,
                    '来源': pub_come,
                    '号': '',
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': real_href,
                    '归属': '吉林省国资委',
                }
                print(title)
                # Do not push records whose body is empty ('None' is str(None)).
                if content == '' or content == 'None':
                    continue
                save_data(result_dict)
                num = num + 1
            except Exception as e:
                log.error(f'ji_lin: failed to process {real_href}: {e}')
    except Exception as e:
        log.error(f'ji_lin: failed to crawl list page {url}: {e}')
    end = time.time()
    print('共', num, '条', '...........', '共耗时', end - start, '秒')


# 上海
def shang_hai():
    """Crawl the Shanghai SASAC normative-document list (pages 1-6) and send each article to Kafka.

    For every list entry the article page is downloaded, the document number
    is extracted, attachment links are collected from the body, and a message
    dict is passed to ``sendKafka``. No de-duplication is performed here (the
    lookup against ``db_storage`` is intentionally disabled). Failures are
    logged and the crawl continues. Returns None; the list of generated ids
    and a summary line are printed at the end.
    """
    id_list = []
    start = time.time()
    num = 0
    host = 'https://www.gzw.sh.gov.cn'
    # Same extension set the original or-chain tested (case-sensitive substrings).
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    # Bracket pairs that mark an already well-formed document number.
    accepted_brackets = (('〔', '〕'), ('【', '】'), ('[', ']'))
    for page in range(1, 7):
        if page == 1:
            url = 'https://www.gzw.sh.gov.cn/shgzw_flfg_zcfg_gfxwj/index.html'
        else:
            url = f'https://www.gzw.sh.gov.cn/shgzw_flfg_zcfg_gfxwj/index_{page}.html'
        try:
            resp_text = requests.get(url=url, headers=headers, verify=False).text
            doc_resp = pq(resp_text)
            for doc_item in doc_resp('.gqzc_list_right ul li').items():
                title = doc_item('a').attr('title').strip()
                pub_time = doc_item('span').text() + ' 00:00:00'
                href = doc_item('a').attr('href')
                if 'https:/' not in href:
                    href = host + href
                # De-duplication is deliberately disabled for the full re-run:
                # is_href = db_storage.find_one({'网址': href})
                # if is_href:
                #     continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False).text
                    doc_href = pq(href_text)
                    doc_href_ = BeautifulSoup(href_text, 'html.parser')
                    content = str(doc_href_.find('div', attrs={'class': 'detail_03'}))
                    # Try three places for the document number, most specific first.
                    try:
                        pub_result = doc_href('.detail_03')
                        pub_result = '沪' + str(pub_result('meta')).split('沪')[1].split('号')[0].strip() + '号'
                    except Exception:
                        try:
                            pub_result = str(
                                '沪' + doc_href('.detail_03 ul').text().split('沪')[1].split('号')[0].strip() + '号')
                        except Exception:
                            pub_result = str(doc_href('.detail_03 p').text().split('号')[0].strip() + '号')
                    if '﹝' in pub_result and '﹞' in pub_result:
                        # Normalise small-form brackets to the standard ones.
                        pub_hao = pub_result.replace('﹝', '〔').replace('﹞', '〕')
                    elif any(left in pub_result and right in pub_result
                             for left, right in accepted_brackets):
                        pub_hao = pub_result
                    else:
                        pub_hao = ''
                    # An overly long match is body text, not a document number.
                    if len(pub_hao) > 20:
                        pub_hao = ''
                    fu_jian_href_list = []
                    for fu_jian_re in re.findall('href="(.*?)"', content):
                        if any(ext in fu_jian_re for ext in attachment_exts):
                            # Only site-relative links need the host prefix; prefixing an
                            # absolute link would produce an unusable URL.
                            if fu_jian_re.startswith('http'):
                                fu_jian_href = fu_jian_re
                            else:
                                fu_jian_href = host + fu_jian_re
                            fu_jian_href_list.append(fu_jian_href)
                    # Only consumed by the (currently disabled) save_data path below.
                    result_dict = {
                        '标题': title,
                        '来源': '',
                        '号': pub_hao,
                        '内容': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '上海市国资委',
                    }
                    print(title)
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    news_id = baseCore.getNextSeq()
                    # todo: fields sent to kafka
                    dic_news = {
                        'attachmentIds': "14,15,16",
                        'author': '',
                        'content': content,
                        'contentWithTag': content,
                        'createDate': time_now,
                        'deleteFlag': 0,
                        'id': news_id,
                        'labels': [{'relationId': "1671", 'relationName': "上海市国资委", 'labelMark': "policy"}],
                        'origin': '',
                        'organ': pub_hao,
                        'topicClassification': '',
                        'issuedNumber': pub_hao,
                        'publishDate': pub_time,
                        'writtenDate': pub_time,
                        'sid': '002',
                        'sourceAddress': '',
                        'summary': '',
                        'title': title
                    }
                    sendKafka(dic_news)
                    print(news_id)
                    id_list.append(news_id)
                    # save_data(result_dict)
                    num = num + 1
                except Exception as e:
                    log.error(f'shang_hai: failed to process {href}: {e}')
        except Exception as e:
            log.error(f'shang_hai: failed to crawl list page {url}: {e}')
    print(id_list)
    end = time.time()
    print('共', num, '条', '...........', '共耗时', end - start, '秒')


# 浙江
def zhe_jiang():
    """Crawl the Zhejiang SASAC policy column and hand every new article to save_data.

    The column page embeds its records in CDATA sections; each record is
    parsed for link, title and publish date. Articles already present in
    ``db_storage`` are skipped. Because list entries may link to several
    differently structured sites, four page layouts are tried in turn to
    extract source, document number and body. Failures are logged and the
    crawl continues. Returns None; a summary line is printed at the end.
    """
    start = time.time()
    num = 0
    url = 'http://gzw.zj.gov.cn/col/col1229430928/index.html'
    try:
        # headers must be passed by keyword: the second positional parameter of
        # requests.get is `params`, which would put the headers in the query string.
        res = requests.get(url, headers=headers).content
        soup = BeautifulSoup(res, 'html.parser')
        # Raw string; the pattern itself is the same as before.
        list_li = re.findall(r'CDATA\[\n(.*?)\]\]></record>', str(soup))
        for record_html in list_li:
            fj_href_list = []
            record = BeautifulSoup(record_html, 'lxml')
            href = record.find('a')['href']
            pub_time = record.find('a').find('span').text
            title = record.find('a').text.replace(pub_time, '').strip()
            if 'http' not in href:
                href = 'http://gzw.zj.gov.cn/' + href
            if db_storage.find_one({'网址': href}):
                continue
            try:
                href_text = requests.get(url=href, headers=headers, verify=False)
                href_text.encoding = href_text.apparent_encoding
                i_soup = BeautifulSoup(href_text.text, 'html.parser')
                i_info = i_soup.find_all(class_='g_xxgk_td')
                if len(i_info) != 0:
                    # Layout 1: information-disclosure table on the provincial site.
                    try:
                        pub_source = str(i_info[4]).split('"g_xxgk_td">')[1].split('</div>')[0]
                        pub_hao = str(i_info[2]).split('"g_xxgk_td">')[1].split('</div>')[0]
                        content = str(i_soup.find(class_='g_content'))
                    except Exception:
                        # Shorter table variant: no document number cell.
                        pub_source = str(i_info[2]).split('"g_xxgk_td">')[1].split('</div>')[0]
                        pub_hao = ''
                        content = str(i_soup.find(class_='g_content'))
                else:
                    try:
                        # Layout 2: markup using rich_media_* classes; the page's own
                        # publish time replaces the one from the list.
                        source = i_soup.find('span', class_='rich_media_meta rich_media_meta_nickname')
                        pub_source = source.find('a').text
                        time_ = i_soup.find('em', id='publish_time')
                        pub_time = time_.text
                        pub_hao = ''
                        content = str(i_soup.find(
                            class_='zh_CN wx_wap_page wx_wap_desktop_fontsize_2 mm_appmsg comment_feature discuss_tab appmsg_skin_default appmsg_style_default pages_skin_pc not_in_mm'))
                    except Exception:
                        try:
                            # Layout 3: markup using ant-space-item elements.
                            source = i_soup.find_all(class_='ant-space-item')
                            pub_source = str(source[0]).split('<span>')[1].split('</span>')[0].replace('来源：', '')
                            pub_hao = ''
                            content = str(i_soup.find(class_='index_wrapper__L_zqV'))
                        except Exception:
                            # Layout 4: zsy_cotitle / zsy_comain markup; attachments are
                            # resolved against www.sasac.gov.cn.
                            source = i_soup.find('div', class_='zsy_cotitle').find('p').text
                            pub_source = source.split('文章来源：')[1].split('发布时间：')[0]
                            pub_hao = ''
                            content = str(i_soup.find('div', class_='zsy_comain')).replace('扫一扫在手机打开当前页', '').strip()
                            # A missing attachment section means "no attachments",
                            # not a reason to drop the article.
                            related = i_soup.find(class_='related')
                            fujian_list = related.find_all('li') if related is not None else []
                            for fujian in fujian_list:
                                fujian_href = 'http://www.sasac.gov.cn/' + str(fujian.find('a')['href']).replace('../', '')
                                fj_href_list.append(fujian_href)
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fj_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '浙江省国资委',
                }
                print(title)
                save_data(result_dict)
                num = num + 1
            except Exception as e:
                log.error(f'zhe_jiang: failed to process {href}: {e}')
    except Exception as e:
        log.error(f'zhe_jiang: failed to crawl list page {url}: {e}')
    end = time.time()
    print('共', num, '条', '...........', '共耗时', end - start, '秒')


# 福建
def fu_jian():
    """Crawl the Fujian SASAC policy list page and hand every new article to save_data.

    Relative links on the list page are ambiguous, so up to three candidate
    absolute URLs are tried until one does not render the site's "404" page.
    Articles already present in ``db_storage`` are skipped. Links that point
    at a PDF are stored with a placeholder body and the PDF itself as the
    only attachment; HTML articles are parsed with two alternative layouts.
    Failures on a single article are logged and do not stop the crawl.
    Returns None; a summary line is printed at the end.
    """
    error_tag = str(404)
    num = 0
    start_time = time.time()
    url = 'http://gzw.fujian.gov.cn/zwgk/zcfg/'
    # Same extension set the original or-chain tested (case-sensitive substrings).
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def fetch_soup(page_url):
        # Download a page and parse it using the detected encoding.
        resp = requests.get(url=page_url, headers=headers, verify=False)
        resp.encoding = resp.apparent_encoding
        return BeautifulSoup(resp.text, 'html.parser')

    def is_not_found(page_soup):
        # The site's error page carries "404" in its first <strong> element.
        strong = page_soup.find('strong')
        return strong is not None and str(strong.text) == error_tag

    try:
        resp_text = requests.get(url=url, headers=headers, verify=False)
        resp_text.encoding = 'utf-8'
        soup = BeautifulSoup(resp_text.text, 'html.parser')
        for li_list in soup.find_all(class_='borbot-line'):
            for a in li_list.find_all('li'):
                href_ = ''
                try:
                    a_text = str(a)
                    title = a_text.split('title="')[-1].split('">')[0].replace('\n', '')
                    href_ = str(a.find('a').get('href'))  # raw link from the list
                    # Candidate resolutions, tried in order; the last one is kept
                    # even if it is also a 404 (same as before).
                    candidates = (
                        href_.replace('../', './').replace('./', 'http://gzw.fujian.gov.cn/zwgk/'),
                        href_.replace('../', './').replace('./', 'http://gzw.fujian.gov.cn/zwgk/zcfg/'),
                        href_.replace('../../', 'http://gzw.fujian.gov.cn/'),
                    )
                    for href in candidates:
                        i_soup = fetch_soup(href)
                        if not is_not_found(i_soup):
                            break
                    real_href = href
                    if db_storage.find_one({'网址': real_href}):
                        continue
                    if '.pdf' in real_href:
                        # The article itself is a remote PDF: keep it as the single
                        # attachment (as a list, like every other branch).
                        fu_jian_href_list = [real_href]
                        pub_hao = ''
                        pub_time = ''
                        pub_source = ''
                        content = '--'
                    else:
                        # i_soup already holds the page for real_href, so it is
                        # reused; previously the page was downloaded again here.
                        try:
                            # Layout 1: title bar with source / publish time.
                            source_ = str(i_soup.find('div', attrs={'class': 'xl_tit2_l'}).text)
                            pub_source = source_.split('来源：')[1].split('发布时间：')[0].strip()
                            pub_time = source_.split('发布时间：')[1].split('浏览量：')[0].strip()
                            content = i_soup.find('div', attrs={'class': 'xl_con1'})
                            pub_hao = ''
                        except Exception:
                            # Layout 2: metadata table with document number.
                            source = str(i_soup.find('table', attrs={'class': 'tp-pho'}).text)
                            pub_hao = source.split('文号')[1].split('发布机构')[0].strip()
                            pub_source = source.split('发布机构')[1].split('生成日期')[0].strip()
                            pub_time = source.split('生成日期')[1].split('标题')[0].strip()
                            content = i_soup.find('div', attrs={'class': 'xl-article-nr'})
                        fu_jian_href_list = [
                            link for link in re.findall('href="(.*?)"', str(content))
                            if any(ext in link for ext in attachment_exts)
                        ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': real_href,
                        '归属': '福建省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num += 1
                except Exception as e:
                    log.error(f'fu_jian: failed to process {href_}: {e}')
    except Exception as e:
        log.error(f'fu_jian: failed to crawl list page {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据，共耗时{end_time - start_time}')


# 山东
def shan_dong():
    """Crawl two Shandong SASAC channels and hand every new article to save_data.

    Each list entry already present in ``db_storage`` is skipped. Articles are
    parsed with a metadata-table layout first; when that fails, a fallback
    reads the title from the page header and looks for the document number in
    ``<span>`` elements. Articles without a body are not saved. Failures are
    logged and the crawl continues. Returns None; a summary line is printed.
    """
    # Local headers (with cookie) intentionally override the module-level ones.
    headers = {
        'Cookie': 'COLLCK=2502513302; COLLCK=2493627587',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.183'
    }
    start = time.time()
    num = 0
    url_list = ['http://gzw.shandong.gov.cn/channels/ch06086/', 'http://gzw.shandong.gov.cn/channels/ch06088/']
    for url in url_list:
        try:
            resp_text = requests.get(url=url, headers=headers, verify=False)
            resp_text.encoding = 'utf-8'
            soup = BeautifulSoup(resp_text.text, 'html.parser')
            for li in soup.find_all(class_='pagedContent'):
                href = li.find('a')['href']
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
                    href_text.encoding = href_text.apparent_encoding
                    i_soup = BeautifulSoup(href_text.text, 'html.parser')
                    # Reset per article so the fallback branch can never pick up
                    # values left over from the previous article.
                    pub_hao = ''
                    pub_source = ''
                    source = None
                    try:
                        # Primary layout: metadata table in the first <tbody>.
                        source = i_soup.find_all('tbody')[0]
                        title = str(source).split('标　　题：</strong>')[1].split('</td>')[0].replace('\r', '').replace('\n', '')
                        pub_time = ''.join(re.findall('<strong>发布日期：</strong>(.*?)</td>', str(source)))
                        pub_hao = ''.join(re.findall('<strong>发文字号：</strong>(.*?)</td>', str(source)))
                        pub_source = ''.join(re.findall('<strong>发文机关：</strong>(.*?)</td>', str(source)))
                        content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
                        if pub_hao == '无':
                            # Table says "none": look for a bracketed number in the body.
                            for p in content.find_all('p'):
                                p_text = p.text
                                # Both brackets must be present ('〔' and '〕' in x
                                # only ever tested the closing one).
                                if '〔' in p_text and '〕' in p_text:
                                    pub_hao = p_text
                                    break
                    except Exception:
                        try:
                            title = str(i_soup.find('div', attrs={'class': 'wz_title'}).text).strip()
                        except Exception:
                            # No title block: concatenate the <h1> elements of #nr.
                            source = i_soup.find('div', attrs={'id': 'nr'})
                            title = ''.join(str(h1.text) for h1 in source.find_all('h1')).strip()
                        pub_time = ''
                        if source is None:
                            # Table lookup failed before `source` was set.
                            source = i_soup.find('div', attrs={'id': 'nr'})
                        span_list = source.find_all('span') if source is not None else []
                        for idx, span in enumerate(span_list):
                            span_text = span.text
                            if ('〔' in span_text and '〕' in span_text) \
                                    or '鲁国' in span_text or '国办发' in span_text:
                                pub_hao = str(span_text)
                                # The number may be split over two spans; append the
                                # next one only if it exists.
                                if '号' not in pub_hao and idx + 1 < len(span_list):
                                    pub_hao = pub_hao + str(span_list[idx + 1].text)
                                break
                        content = i_soup.find(class_="wz_zoom scroll_cont ScrollStyle")
                    # `content` is a Tag or None, so test it before stringifying;
                    # comparing it with '' / 'None' never matched.
                    if content is None or str(content) == '':
                        continue
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': [],
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '山东省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num = num + 1
                except Exception as e:
                    log.error(f'shan_dong: failed to process {href}: {e}')
        except Exception as e:
            log.error(f'shan_dong: failed to crawl list page {url}: {e}')
    end = time.time()
    print('共', num, '条', '...........', '共耗时', end - start, '秒')


# 广东
def guang_dong():
    """Crawl every page of the Guangdong SASAC policy list and hand new articles to save_data.

    The total record count is read from the first page's pager and converted
    to a page count (23 records per page). Each list entry already present in
    ``db_storage`` is skipped; otherwise the article is downloaded and parsed
    for publish time, source, body HTML and attachment links. A failure on one
    page or article is logged and the crawl continues. Returns None; a summary
    line is printed at the end.
    """
    start = time.time()
    num = 0
    url = 'http://gzw.gd.gov.cn/zcfg/index.html'
    page_size = 23
    # Same extension set the original or-chain tested (case-sensitive substrings).
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    try:
        resp_href = requests.get(url=url, headers=headers, verify=False)
        resp_href.encoding = resp_href.apparent_encoding
        doc_resp = BeautifulSoup(resp_href.text, 'html.parser')
        page_items = str(doc_resp.find('div', attrs={'class': 'page'}).text)
        total = int(page_items.split('共 ')[1].split(' 条')[0].strip())
        # Integer ceiling division; avoids the float round trip.
        pagen = -(-total // page_size)
        for page in range(1, pagen + 1):
            if page == 1:
                url = 'http://gzw.gd.gov.cn/zcfg/index.html'
            else:
                url = f'http://gzw.gd.gov.cn/zcfg/index_{page}.html'
            try:
                resp_text = requests.get(url=url, headers=headers, verify=False).text
                doc_items = list(pq(resp_text)('.list li').items())
            except Exception as e:
                # One unreachable page must not abort the remaining pages.
                log.error(f'guang_dong: failed to crawl list page {url}: {e}')
                continue
            for doc_item in doc_items:
                title = doc_item('a').text().replace('\n', '')
                href = doc_item('a').attr('href')
                if not href:
                    # Entry without a link: nothing to fetch.
                    continue
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False).text
                    doc_href = pq(href_text)
                    pub_result = doc_href('.title_info_sub').text()
                    pub_time = pub_result.split('文章来源：')[0].replace('发布时间：', '').strip() + ' 00:00:00'
                    pub_source = pub_result.split('文章来源：')[1].strip()
                    i_soup = BeautifulSoup(href_text, 'html.parser')
                    # attrs must be a dict; the previous set literal
                    # {'class', 'box_info'} only matched by accident.
                    content = str(i_soup.find('div', attrs={'class': 'box_info'}))
                    fu_jian_href_list = [
                        link for link in re.findall('href="(.*?)"', content)
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': '',
                        '内容': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '广东省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num = num + 1
                except Exception as e:
                    log.error(f'guang_dong: failed to process {href}: {e}')
    except Exception as e:
        log.error(f'guang_dong: failed to crawl {url}: {e}')
    end = time.time()
    print('共', num, '条', '...........', '共耗时', end - start, '秒')


# 海南
def hai_nan():
    def hai_nan1():
        """Crawl the department-document (部门文件) listing of gzw.hainan.gov.cn.

        Iterates the 13 listing pages under ``/zwgk_23509/zfwj/bmwj/``.  Detail
        URLs already stored in ``db_storage`` are skipped.  Every other detail
        page is parsed with up to three layout-specific parsers, tried in
        order, and the resulting record is handed to ``save_data``.  The number
        of saved records and the elapsed time are printed at the end.
        """
        from urllib.parse import urljoin

        attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                           '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
        qr_hint = '扫一扫在手机打开当前页'

        def attachment_urls(page_url, html):
            """Return absolute URLs for the attachment links found in *html*."""
            # urljoin copes with './x', '../x', '/x' and absolute links.  The
            # previous split('./')[1] raised IndexError on anything but './x',
            # which silently diverted parsing into the next layout branch.
            return [urljoin(page_url, link)
                    for link in re.findall('href="(.*?)"', html)
                    if any(ext in link for ext in attachment_exts)]

        def fetch_soup(page_url):
            """Fetch *page_url* and parse it with the detected text encoding."""
            resp = requests.get(url=page_url, headers=headers, verify=False)
            resp.encoding = resp.apparent_encoding
            return BeautifulSoup(resp.text, 'html.parser')

        # Each parser returns (source, doc number, publish time or None,
        # content html, attachment urls).  A publish time of None means the
        # value taken from the listing page is kept.

        def parse_xxgk(page_url):
            """Layout 1: metadata in ``.xxgk-syxl-t1023``, body in ``.xxgk-syxl``."""
            doc_href = pq(requests.get(url=page_url, headers=headers, verify=False).content)
            pub_result = doc_href('.xxgk-syxl-t1023.clear').remove('script').text()
            pub_result = pub_result.replace(' ', '').replace('　　', '')
            pub_source = pub_result.split('发文机关：')[1].split('成文日期：')[0].strip()
            pub_hao = pub_result.split('文号：')[1].split('发布日期：')[0].strip()
            content = str(doc_href('.xxgk-syxl').children()).replace(qr_hint, '')
            return pub_source, pub_hao, None, content, attachment_urls(page_url, content)

        def parse_tbody(page_url):
            """Layout 2: metadata in the first ``<tbody>``, body in ``div.xly``."""
            source = fetch_soup(page_url)
            tbody_text = str(source.find('tbody').text)
            pub_source = tbody_text.split('发文机关：')[1].split('发文日期：')[0].strip().lstrip()
            pub_hao = tbody_text.split('文　　号：')[1].split('主 题 词：')[0].strip().lstrip()
            pub_time = tbody_text.split('发文日期：')[1].split('名　　称：')[0].strip().lstrip()
            pub_time = pub_time.replace('年', '-').replace('月', '-').replace('日', '')
            content = str(source.find('div', attrs={'class': 'xly'})).replace(qr_hint, '')
            return pub_source, pub_hao, pub_time, content, attachment_urls(page_url, content)

        def parse_con_div(page_url):
            """Layout 3: metadata in ``div.con_div``, body in ``div.TRS_UEDITOR``."""
            source = fetch_soup(page_url)
            con_div_text = str(source.find('div', attrs={'class': 'con_div'}).text)
            pub_time = con_div_text.split('来源：')[0].lstrip().strip()
            pub_source = con_div_text.split('来源：')[1].split(' 【字体：')[0].strip().lstrip()
            content = str(source.find('div', attrs={'class': 'TRS_UEDITOR'})).replace(qr_hint, '')
            # In this layout the attachments are listed in a separate block.
            other_word = str(source.find('div', attrs={'class': 'other-word'}))
            return pub_source, '', pub_time, content, attachment_urls(page_url, other_word)

        num = 0
        start_time = time.time()
        for page in range(13):
            if page == 0:
                url = 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/'
            else:
                url = 'http://gzw.hainan.gov.cn/zwgk_23509/zfwj/bmwj/index_{}.html'.format(page)
            try:
                doc_resp = pq(requests.get(url=url, headers=headers, verify=False).content)
                for doc_item in doc_resp('.list-right_title').items():
                    link = doc_item('a:nth-child(2)')
                    raw_href = link.attr('href')
                    if not raw_href:
                        # Entry without a link: skip it instead of losing the page.
                        continue
                    pub_time = doc_item.next().text().replace('发布时间： ', '') + ' 00:00:00'
                    title = link.text().strip()
                    # Same result as the old '../../' and './' rewrites, but
                    # also correct for any other relative or absolute form.
                    href = urljoin(url, raw_href)
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        parsed = None
                        for parser in (parse_xxgk, parse_tbody):
                            try:
                                parsed = parser(href)
                                break
                            except Exception:
                                # Layout mismatch: fall through to the next parser.
                                pass
                        if parsed is None:
                            # Last resort; a failure here skips the record.
                            parsed = parse_con_div(href)
                        pub_source, pub_hao, page_pub_time, content, fu_jian_href_list = parsed
                        if page_pub_time is not None:
                            pub_time = page_pub_time
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '海南省国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num = num + 1
                    except Exception as e:
                        log.error(f'hai_nan1: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'hai_nan1: failed to process listing page {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def hai_nan2():
        def hai_nan_sw(page_href):
            """Crawl one listing page and save every document not yet stored.

            Each ``list-right_title fon_1`` entry on the listing page links to a
            detail page on www.hainan.gov.cn.  Entries whose URL is already in
            ``db_storage`` are skipped; the others are fetched, parsed from the
            ``zwgk_comr1`` metadata list and handed to ``save_data``.

            Args:
                page_href: URL of the listing page to crawl.

            Returns:
                Number of records handed to ``save_data``.
            """
            from urllib.parse import urljoin

            attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                               '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
            num = 0
            req = requests.get(url=page_href, headers=headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.text, 'html.parser')
            finally:
                req.close()
            for doc_item in doc_resp.find_all(class_='list-right_title fon_1'):
                item_html = str(doc_item)
                try:
                    title = item_html.split('target="_blank">')[1].split('</a>')[0]
                    href = 'https://www.hainan.gov.cn' + item_html.split('href="')[1].split('" target')[0]
                except IndexError:
                    # Entry without the expected anchor markup: skip it rather
                    # than abort the rest of the page.
                    continue
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        href_text.close()
                    info_items = doc_href.find(class_='zwgk_comr1').find_all('li')
                    pub_source = str(info_items[1]).split('</strong>')[1].split('</span>')[0]
                    meta_html = str(info_items[3])
                    pub_hao = meta_html.split('文  号：</strong>')[1].split('</span>')[0].strip()
                    pub_time = meta_html.split('发布日期：</strong>')[1].split('</span>')[0].strip()
                    content = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                    # Resolve attachment links against the detail page itself;
                    # they used to be prefixed with the Sichuan SASAC host.
                    fu_jian_href_list = [
                        urljoin(href, link)
                        for link in re.findall('href="(.*?)"', str(content))
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '海南省国资委',
                    }
                    save_data(result_dict)
                    print(title)
                    num += 1
                except Exception as e:
                    log.error(f'hai_nan_sw: failed to process {href}: {e}')
            return num

        def hai_nan_szf(page_href):
            """Crawl one listing page and save every document not yet stored.

            Works like a plain ``list-right_title fon_1`` listing crawl, with
            one extra step: when the ``zwgk_comr1`` metadata list cannot be
            parsed, the detail page is parsed from its ``line mar-t2 con_div``
            header instead (no document number in that layout).

            Args:
                page_href: URL of the listing page to crawl.

            Returns:
                Number of records handed to ``save_data``.
            """
            from urllib.parse import urljoin

            attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                               '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
            num = 0
            req = requests.get(url=page_href, headers=headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.text, 'html.parser')
            finally:
                req.close()
            for doc_item in doc_resp.find_all(class_='list-right_title fon_1'):
                item_html = str(doc_item)
                try:
                    title = item_html.split('target="_blank">')[1].split('</a>')[0].replace('\n', '')
                    href = 'https://www.hainan.gov.cn' + item_html.split('href="')[1].split('" target')[0]
                except IndexError:
                    # Entry without the expected anchor markup: skip it rather
                    # than abort the rest of the page.
                    continue
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        href_text.close()
                    try:
                        info_items = doc_href.find(class_='zwgk_comr1').find_all('li')
                        pub_source = str(info_items[1]).split('</strong>')[1].split('</span>')[0]
                        meta_html = str(info_items[3])
                        pub_hao = meta_html.split('文  号：</strong>')[1].split('</span>')[0].strip()
                        pub_time = meta_html.split('发布日期：</strong>')[1].split('</span>')[0].strip()
                        content = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                    except Exception:
                        # Alternative layout: "<time> 来源：<source> 【字体：..." header.
                        header_text = str(doc_href.find('div', attrs={'class': 'line mar-t2 con_div'}).text)
                        pub_source = header_text.split('来源：')[1].split(' 【字体：')[0].lstrip().strip()
                        pub_time = header_text.split('来源：')[0].lstrip().strip()
                        pub_hao = ''
                        content = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                    # Resolve attachment links against the detail page itself;
                    # they used to be prefixed with the Sichuan SASAC host.
                    fu_jian_href_list = [
                        urljoin(href, link)
                        for link in re.findall('href="(.*?)"', str(content))
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '海南省国资委',
                    }
                    save_data(result_dict)
                    print(title)
                    num += 1
                except Exception as e:
                    log.error(f'hai_nan_szf: failed to process {href}: {e}')
            return num

        def hai_nan_szfbgt(page_href):
            """Crawl one listing page and save every document not yet stored.

            Detail pages are parsed from the ``zwgk_comr1`` metadata list, where
            the document-number label is accepted in two spacing variants.  If
            that list cannot be parsed, the ``line mar-t2 con_div`` header is
            used instead (no document number in that layout).

            Args:
                page_href: URL of the listing page to crawl.

            Returns:
                Number of records handed to ``save_data``.
            """
            from urllib.parse import urljoin

            attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                               '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
            num = 0
            req = requests.get(url=page_href, headers=headers, verify=False)
            try:
                req.encoding = req.apparent_encoding
                doc_resp = BeautifulSoup(req.text, 'html.parser')
            finally:
                req.close()
            for doc_item in doc_resp.find_all(class_='list-right_title fon_1'):
                item_html = str(doc_item)
                try:
                    title = item_html.split('target="_blank">')[1].split('</a>')[0]
                    href = 'https://www.hainan.gov.cn' + item_html.split('href="')[1].split('" target')[0]
                except IndexError:
                    # Entry without the expected anchor markup: skip it rather
                    # than abort the rest of the page.
                    continue
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        href_text.close()
                    try:
                        info_items = doc_href.find(class_='zwgk_comr1').find_all('li')
                        pub_source = str(info_items[1]).split('</strong>')[1].split('</span>')[0]
                        meta_html = str(info_items[3])
                        try:
                            pub_hao = meta_html.split('文  号：</strong>')[1].split('</span>')[0].strip()
                        except IndexError:
                            # Same label, wider spacing between the characters.
                            pub_hao = meta_html.split('文       号：</strong>')[1].split('</span>')[0].strip()
                        pub_time = meta_html.split('发布日期：</strong>')[1].split('</span>')[0].strip()
                        content = doc_href.find(class_='con_cen line mar-t2 xxgk_content_content')
                    except Exception:
                        # Alternative layout: "<time> 来源：<source> 【字体：..." header.
                        header_text = str(doc_href.find('div', attrs={'class': 'line mar-t2 con_div'}).text)
                        pub_source = header_text.split('来源：')[1].split(' 【字体：')[0].lstrip().strip()
                        pub_time = header_text.split('来源：')[0].lstrip().strip()
                        pub_hao = ''
                        content = doc_href.find('div', attrs={'class': 'xxgk_content_content'})
                    # Resolve attachment links against the detail page itself;
                    # they used to be prefixed with the Sichuan SASAC host.
                    fu_jian_href_list = [
                        urljoin(href, link)
                        for link in re.findall('href="(.*?)"', str(content))
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '海南省国资委',
                    }
                    save_data(result_dict)
                    print(title)
                    num += 1
                except Exception as e:
                    log.error(f'hai_nan_szfbgt: failed to process {href}: {e}')
            return num

        def hai_nan_zy(page_href):
            """Crawl one listing page whose entries point at www.gov.cn documents.

            The listing is the ``div.list.list_1.list_2`` block; each ``<li>``
            links to a detail page.  Relative ``../../`` links are rewritten to
            ``https://www.gov.cn/zhengce/``.  Detail pages are parsed from the
            ``table.bd1`` metadata table when present, otherwise from the
            ``pages-date`` header (source and time fall back to empty strings
            when that header cannot be parsed either).

            Args:
                page_href: URL of the listing page to crawl.

            Returns:
                Number of records handed to ``save_data``.
            """
            from urllib.parse import urljoin

            attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                               '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
            # Detail pages that are explicitly excluded from the crawl.
            excluded_urls = (
                'https://www.gov.cn/jrzg/2013-11/27/content_2536600.htm',
                'https://www.gov.cn/jrzg/2013-09/28/content_2497241.htm',
            )
            num = 0
            req = requests.get(url=page_href, headers=headers, verify=False)
            try:
                # Raw bytes are parsed, so no text encoding has to be set.
                doc_resp = BeautifulSoup(req.content, 'html.parser')
            finally:
                req.close()
            list_div = doc_resp.find('div', attrs={'class': 'list list_1 list_2'})
            for doc_item in list_div.find_all('li'):
                anchor = doc_item.find('a')
                if anchor is None or not anchor.get('href'):
                    # <li> without a usable link: skip it rather than abort the page.
                    continue
                title = str(anchor.text)
                i_href = anchor.get('href')
                # https://www.gov.cn/zhengce/202307/content_6893055.htm
                if 'https://www.gov.cn/zhengce/' not in i_href:
                    i_href = str(i_href).replace('../../', 'https://www.gov.cn/zhengce/')
                try:
                    if db_storage.find_one({'网址': i_href}):
                        continue
                    if i_href in excluded_urls:
                        continue
                    href_text = requests.get(url=i_href, headers=headers, verify=False)
                    try:
                        doc_href = BeautifulSoup(href_text.content, 'html.parser')
                    finally:
                        href_text.close()
                    try:
                        cells = doc_href.find('table', class_='bd1').find_all('td')
                        pub_time = str(cells[13]).replace('年', '-').replace('月', '-').replace('日', '')
                        pub_time = pub_time.split('<td>')[1].split('</td>')[0]
                        pub_source = str(cells[5]).split('<td>')[1].split('</td>')[0]
                        pub_hao = str(cells[11]).split('<td>')[1].split('</td>')[0]
                        content = doc_href.find(class_='b12c')
                    except Exception:
                        try:
                            date_block = doc_href.find(class_='pages-date')
                            pub_source = date_block.find('span', class_='font').text.replace('来源：', '').strip()
                            pub_time = str(date_block).split('<span')[0].split('"pages-date">')[1].split('来源')[
                                0].strip()
                        except Exception:
                            pub_source = ''
                            pub_time = ''
                        pub_hao = ''
                        content = doc_href.find(class_='pages_content')
                    # Resolve attachment links against the detail page itself;
                    # they used to be prefixed with the Sichuan SASAC host.
                    fu_jian_href_list = [
                        urljoin(i_href, link)
                        for link in re.findall('href="(.*?)"', str(content))
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': i_href,
                        '归属': '海南省国资委',
                    }
                    save_data(result_dict)
                    print(title)
                    num += 1
                except Exception as e:
                    log.error(f'hai_nan_zy: failed to process {i_href}: {e}')
            return num

        def start():
            """Discover the document categories and crawl every page of each.

            Reads the category links (``.nzcti`` blocks) from the overview page,
            then crawls them by position: the first category with
            ``hai_nan_zy`` (23 ``home*.htm`` pages, one second apart), the
            second with ``hai_nan_sw`` (8 pages), the third with
            ``hai_nan_szf`` (84 pages) and any further one with
            ``hai_nan_szfbgt`` (84 pages).  Prints the number of saved records
            and the elapsed time at the end.
            """
            num = 0
            start_time = time.time()
            url = "https://www.hainan.gov.cn/hainan/qzcwj/zywj.shtml"
            # Position in the category list -> (page count, first page, crawler)
            # for the categories paginated as list3.shtml, list3_2.shtml, ...
            list3_plan = {
                1: (8, 'https://www.hainan.gov.cn/hainan/swygwj/list3.shtml', hai_nan_sw),
                2: (84, 'https://www.hainan.gov.cn/hainan/szfwj/list3.shtml', hai_nan_szf),
            }
            default_plan = (84, 'https://www.hainan.gov.cn/hainan/szfbgtwj/list3.shtml', hai_nan_szfbgt)
            try:
                req = requests.get(url=url, headers=headers, verify=False)
                req.encoding = req.apparent_encoding
                doc_resp = pq(req.text)
                leibie_href_list = []
                for doc_item in doc_resp('.nzcti').items():
                    anchor = doc_item('a')
                    leibie_href = anchor.attr('href')
                    if '更多' in anchor.text():
                        # "More" links are site-relative.
                        leibie_href = 'https://www.hainan.gov.cn' + leibie_href
                    leibie_href_list.append(leibie_href)
                # Crawl every category; dispatch by position in the list.
                for index, category_url in enumerate(leibie_href_list):
                    if index == 0:
                        for page in range(23):
                            # The first page is home.htm, later ones home_<n>.htm.
                            # (This used to test max_page, so home.htm was never
                            # requested.)
                            page_name = 'home.htm' if page == 0 else f'home_{page}.htm'
                            page_href = str(category_url) + page_name
                            try:
                                num += hai_nan_zy(page_href)
                            except Exception as e:
                                log.error(f'hai_nan start: failed on {page_href}: {e}')
                            time.sleep(1)
                        continue
                    max_page, first_page, crawl_page = list3_plan.get(index, default_plan)
                    later_prefix = str(category_url).split('list3')[0]
                    for page in range(max_page):
                        # https://www.hainan.gov.cn/hainan/swygwj/list3_2.shtml
                        if page == 0:
                            page_href = first_page
                        else:
                            page_href = later_prefix + 'list3_{}.shtml'.format(page + 1)
                        try:
                            num += crawl_page(page_href)
                        except Exception as e:
                            log.error(f'hai_nan start: failed on {page_href}: {e}')
            except Exception as e:
                log.error(f'hai_nan start: failed to read categories from {url}: {e}')
            end_time = time.time()
            print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

        start()

    hai_nan1()
    hai_nan2()


# 四川
def si_chuan():
    """Crawl the two policy listing pages of the Sichuan SASAC site.

    For every ``.xxgkzd .pc`` entry the detail page is fetched unless its URL
    is already in ``db_storage``; title and body are read from the detail
    page, the publish date from the listing.  Records are handed to
    ``save_data``; the number saved and the elapsed time are printed at the
    end.
    """
    from urllib.parse import urljoin

    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    for page in range(1, 3):
        if page == 1:
            url = 'http://gzw.sc.gov.cn/scsgzw/CU230401030101/cu_xxgk_list.shtml'
        else:
            url = 'http://gzw.sc.gov.cn/scsgzw/CU230401030101/cu_xxgk_list_2.shtml'
        try:
            resp_text = requests.get(url=url, headers=headers, verify=False).text
            for doc_item in pq(resp_text)('.xxgkzd .pc').items():
                raw_href = doc_item('a').attr('href')
                if not raw_href:
                    # Entry without a link: skip it instead of losing the page.
                    continue
                pub_time = doc_item('span').text().strip() + ' 00:00:00'
                # Site-relative links resolve to http://gzw.sc.gov.cn/... as
                # before; absolute links (http or https) are kept unchanged.
                href = urljoin(url, raw_href)
                if db_storage.find_one({'网址': href}):
                    continue
                try:
                    href_text = requests.get(url=href, headers=headers, verify=False).text
                    doc_href = pq(href_text)
                    title = str(doc_href('.xxgkzn_title').text()).replace('\n', '').replace('\r', '')
                    content = str(doc_href('#scrollBox').children())
                    fu_jian_href_list = [
                        urljoin(href, link)
                        for link in re.findall('href="(.*?)"', content)
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': '',
                        '号': '',
                        '内容': content,
                        # Store the filtered attachment URLs, not every raw
                        # href found in the body.
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '四川省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num = num + 1
                except Exception as e:
                    log.error(f'si_chuan: failed to process {href}: {e}')
        except Exception as e:
            log.error(f'si_chuan: failed to process listing page {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 广西
def guang_xi():
    """Crawl the per-year document sections of the Guangxi SASAC site.

    ``url_all`` lists one section per line: the section base URL followed by
    its number of listing pages.  Every listing page (``index.shtml``,
    ``index_1.shtml``, ...) is read, detail URLs already in ``db_storage`` are
    skipped, and each remaining detail page is parsed and handed to
    ``save_data``.  The number saved and the elapsed time are printed at the
    end.
    """
    from urllib.parse import urljoin

    num = 0
    start_time = time.time()
    url_all = """
    http://gzw.gxzf.gov.cn/wjzx/2023nwj/  1
    http://gzw.gxzf.gov.cn/wjzx/2022nwj/  1
    http://gzw.gxzf.gov.cn/wjzx/2021nwj/  1
    http://gzw.gxzf.gov.cn/wjzx/2020nwj/  1
    http://gzw.gxzf.gov.cn/wjzx/2019nwj/  1
    http://gzw.gxzf.gov.cn/wjzx/2018nwj/  2
    http://gzw.gxzf.gov.cn/wjzx/2017nwj/  2
    http://gzw.gxzf.gov.cn/wjzx/2016nwj/  2
    http://gzw.gxzf.gov.cn/wjzx/2015nwj/  3
    http://gzw.gxzf.gov.cn/wjzx/2014nwj/  2
    http://gzw.gxzf.gov.cn/wjzx/2013nwj/  2
    http://gzw.gxzf.gov.cn/wjzx/2012nwj/  2
    http://gzw.gxzf.gov.cn/wjzx/2011nwj/  5
    http://gzw.gxzf.gov.cn/wjzx/wjhbdej2008n2010n/  1
    http://gzw.gxzf.gov.cn/wjzx/wjhbdyj2004n2007n/  1
    http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2013n/  1
    http://gzw.gxzf.gov.cn/wjzx/jshgfxwj2004n2015n/  1
    http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2015n/  1
    """
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    for url_info in url_all.split('\n')[1:-1]:
        fields = url_info.split()
        if not fields:
            continue
        url_1 = fields[0]
        # Honour the page count listed next to each section.  The loop used to
        # be fixed at one page, so later listing pages were never visited.
        page_count = int(fields[1]) if len(fields) > 1 and fields[1].isdigit() else 1
        for page in range(page_count):
            if page == 0:
                url = f'{url_1}index.shtml'
            else:
                url = f'{url_1}index_{page}.shtml'
            try:
                resp_text = requests.get(url=url, headers=headers, verify=False).content
                for doc_item in pq(resp_text)('#morelist li').items():
                    anchor = doc_item('a')
                    raw_title = anchor.attr('title')
                    raw_href = anchor.attr('href')
                    if raw_title is None or raw_href is None:
                        # Entry without link metadata: skip it instead of
                        # losing the rest of the page.
                        continue
                    title = raw_title.strip()
                    # './t123.shtml' resolves to <section>/t123.shtml as before.
                    href = urljoin(url, raw_href)
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        href_text = requests.get(url=href, headers=headers, verify=False).content
                        doc_href = pq(href_text)
                        pub_result = doc_href('.article-inf-left').text()
                        pub_hao_result = doc_href('.article-h2').text()
                        if '﹝' in pub_hao_result and '﹞' in pub_hao_result:
                            # Normalise small-form brackets to the usual ones.
                            pub_hao = pub_hao_result.replace('﹝', '〔').replace('﹞', '〕')
                        elif '〔' in pub_hao_result and '〕' in pub_hao_result:
                            pub_hao = pub_hao_result
                        else:
                            pub_hao = ''
                        pub_time = pub_result.split('来源：')[0].strip() + ':00'
                        # Text after "来源：" up to an optional "作者：" marker;
                        # a missing "来源：" raises and skips the record.
                        pub_source = pub_result.split('来源：')[1].split('作者：')[0].strip()
                        content = str(doc_href('.article-con div:first-child'))
                        fu_jian_href_list = [
                            urljoin(href, link)
                            for link in re.findall('href="(.*?)"', content)
                            if any(ext in link for ext in attachment_exts)
                        ]
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '广西壮族自治区国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num = num + 1
                    except Exception as e:
                        log.error(f'guang_xi: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'guang_xi: failed to process listing page {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 贵州
def gui_zhou():
    """Crawl the Guizhou SASAC policy listing and save every new document.

    Walks the 11 listing pages of
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/ (``alist.html`` followed by
    ``alist_1.html`` .. ``alist_10.html``).  For each entry whose detail URL
    is not yet stored in ``db_storage`` the detail page is fetched, the
    publisher, issue date and document number are parsed from the
    ``.xxgk_xl_top`` header, the ``#Zoom`` body and its attachment links are
    collected, and the record is passed to ``save_data``.  The number of
    processed records and the elapsed time are printed at the end.

    The original notes also list
    http://gzw.guizhou.gov.cn/zwgk/xxgkml/qlqdhzrqd/ (1 page); that section
    is not crawled by this function.

    The crawl is best effort: a failing listing page or detail page is
    logged and skipped, it never aborts the run.
    """
    # Substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    # Inline-script residue that ends up in the header text and is removed.
    script_head = 'var str = ""; var str_1 = "'
    script_tail = '"; if (str == "") { document.write(str_1); } else { document.write(str); }'

    num = 0
    start_time = time.time()
    for page in range(0, 11):
        if page == 0:
            url = 'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist.html'
        else:
            url = f'http://gzw.guizhou.gov.cn/zwgk/xxgkml/zcwj/alist_{page}.html'
        try:
            resp_text = requests.get(url=url, headers=headers, verify=False).content
            for doc_item in pq(resp_text)('.c').items():
                href = doc_item('a').attr('href')
                if not href:
                    # Entry without a link: nothing to fetch or de-duplicate on.
                    continue
                if db_storage.find_one({'网址': href}):
                    # Already stored by an earlier run.
                    continue
                try:
                    title = doc_item('a').text().strip()
                    href_text = requests.get(url=href, headers=headers, verify=False)
                    if '404 Not Found' in href_text.text:
                        continue
                    doc_href = pq(href_text.content)
                    pub_result = doc_href('.xxgk_xl_top').text()
                    pub_result = pub_result.replace(script_head, '').replace(script_tail, '')
                    # Turn "YYYY年MM月DD日" into "YYYY-MM-DD".  The day marker is
                    # dropped (not replaced by a space) so the final timestamp
                    # has exactly one space before the time part.
                    pub_date = pub_result.split('发文日期: ')[1].split('文号:')[0].strip()
                    pub_date = pub_date.replace('年', '-').replace('月', '-').replace('日', '')
                    pub_time = pub_date.strip() + ' 00:00:00'
                    pub_source = pub_result.split('发布机构:')[1].split('发文日期:')[0].strip()
                    pub_hao = pub_result.split('文号:')[1].split('是否有效:')[0].strip()
                    # The page uses '无' as a placeholder for an absent value.
                    if pub_source == '无':
                        pub_source = ''
                    if pub_hao == '无':
                        pub_hao = ''
                    # Directory of the detail page; relative image and
                    # attachment links are rewritten against it.
                    base_url = href.split('/t')[0]
                    content = str(doc_href('#Zoom').children()).replace('src=".', f'src="{base_url}')
                    fu_jian_href_list = [
                        base_url + '/' + link.replace('./', '')
                        for link in re.findall('href="(.*?)"', content)
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '贵州省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num = num + 1
                except Exception as e:
                    log.error(f'gui_zhou: failed to process {href}: {e}')
        except Exception as e:
            log.error(f'gui_zhou: failed to process listing page {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 云南
def yun_nan():
    """Crawl two Yunnan SASAC listings and save every new document.

    Runs ``yun_nan1`` (listing ``c100093/zfxxgk_gkgz``) and then ``yun_nan2``
    (listing ``c100095/zfxxgk_list``).  Both skip URLs already present in
    ``db_storage`` and pass each parsed record to ``save_data``.  The crawl
    is best effort: failures are logged and the affected page or document is
    skipped.
    """
    # Substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def _collect_attachments(html, href):
        """Return attachment URLs for the links in *html* that contain '/files'.

        The URL is the detail-page URL without '.shtml' and 'content_',
        followed by '/files' and the part of the link after '/files'.
        Attachment links without a '/files' segment are ignored.
        """
        base = href.replace('.shtml', '').replace('content_', '')
        links = []
        for link in re.findall('href="(.*?)"', html):
            if any(ext in link for ext in attachment_exts) and '/files' in link:
                links.append(base + '/files' + link.split('/files')[1])
        return links

    def yun_nan1():
        """Crawl listing pages 1-5 of c100093/zfxxgk_gkgz.

        The original notes mention
        http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml (9 pages) and
        http://gzw.yn.gov.cn/yngzw/c100040/zfxxgk_list.shtml (1 page); only
        the first five pages of the c100093 listing are requested here.
        """
        # Selector of the detail page's content container.
        detail_sel = '#gknbxq_container > div > div.zfxxgk-content.zfxxgk_content'
        num = 0
        start_time = time.time()
        for page in range(1, 6):
            if page == 1:
                url = 'http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz.shtml'
            else:
                url = f'http://gzw.yn.gov.cn/yngzw/c100093/zfxxgk_gkgz_{page}.shtml'
            resp = None
            try:
                resp = requests.get(url=url, headers=headers, verify=False)
                doc_resp = pq(resp.content)
                for doc_item in doc_resp('.gkgz_list_content li').items():
                    link = doc_item('a')
                    href = link.attr('href')
                    if not href:
                        # List item without a link: skip it instead of
                        # failing the whole listing page.
                        continue
                    if not href.startswith(('http:', 'https:')):
                        href = 'http://gzw.yn.gov.cn' + href
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        if '.shtml' in href:
                            href_resp = requests.get(url=href, headers=headers, verify=False)
                            try:
                                href_resp.encoding = href_resp.apparent_encoding
                                href_text = href_resp.text
                            finally:
                                href_resp.close()
                            doc_href = BeautifulSoup(href_text, 'html.parser')
                            title = doc_href.select(detail_sel + ' > h2')[0].text.strip()
                            # Document number: the text between the first
                            # '(' and the following ')' of the first <p>.
                            # NOTE(review): a page without parentheses raises
                            # here and the document is skipped, as before.
                            pub_hao = str(doc_href.select(detail_sel + ' > p')[0].text) \
                                .split('(')[1].split(')')[0].replace('\n', '')
                            content = doc_href.select(detail_sel)[0]
                            fu_jian_href_list = _collect_attachments(str(content), href)
                        elif 'display' in href:
                            continue
                        else:
                            # The link is not an .shtml page, so the link
                            # itself is stored as the attachment.  The title
                            # must come from the listing entry; previously it
                            # was left unset, so the record either failed or
                            # reused the previous document's title.
                            title = link.text().strip()
                            content = ''
                            pub_hao = ''
                            fu_jian_href_list = [href]
                        result_dict = {
                            '标题': title,
                            '来源': '',
                            '号': pub_hao,
                            '内容': str(content),
                            '附件网址': fu_jian_href_list,
                            '发布时间': '',
                            '网址': href,
                            '归属': '云南省国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num = num + 1
                    except Exception as e:
                        log.error(f'yun_nan1: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'yun_nan1: failed to process listing page {url}: {e}')
            finally:
                if resp is not None:
                    resp.close()
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def yun_nan2():
        """Crawl listing pages 1-3 of c100095/zfxxgk_list."""
        num = 0
        start_time = time.time()
        for page in range(1, 4):
            if page == 1:
                url = 'http://gzw.yn.gov.cn/yngzw/c100095/zfxxgk_list.shtml'
            else:
                url = f'http://gzw.yn.gov.cn/yngzw/c100095/zfxxgk_list_{page}.shtml'
            res = None
            try:
                res = requests.get(url=url, headers=headers, verify=False)
                # Decode the raw bytes as UTF-8 directly.  The previous
                # text -> ISO-8859-1 -> UTF-8 round trip raised whenever
                # requests had picked an encoding other than ISO-8859-1.
                soup = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
                li_list = soup.find('ul', attrs={'class': 'zfxxgk-nr-cnet'}).find_all('li')
                for li in li_list:
                    anchor = li.find('a')
                    if anchor is None or not anchor.get('href'):
                        continue
                    title = str(anchor.text).strip()
                    # NOTE(review): the listing's <span> text (apparently the
                    # publish date) used to be extracted here but was never
                    # used; '发布时间' is still stored empty.  Confirm whether
                    # it should be filled in.
                    href = 'http://gzw.yn.gov.cn' + anchor.get('href').replace(' ', '')
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        if '.shtml' in href:
                            # headers must be passed by keyword: the second
                            # positional argument of requests.get is `params`.
                            res_ = requests.get(href, headers=headers, verify=False)
                            try:
                                detail_text = res_.content.decode('utf-8')
                            finally:
                                res_.close()
                            detail = BeautifulSoup(detail_text, 'html.parser')
                            # Document number: concatenated <span> texts of
                            # the first MsoNormal paragraph, kept only when
                            # it contains one of the brackets '﹝' / '﹞'.
                            pub_hao = ''
                            first_para = detail.find('p', attrs={'class': 'MsoNormal'})
                            if first_para is not None:
                                pub_hao = ''.join(str(span.text) for span in first_para.findAll('span'))
                                if '﹝' not in pub_hao and '﹞' not in pub_hao:
                                    pub_hao = ''
                            content = str(detail.find('div', attrs={'class': 'zfxxgk-right'}))
                            fu_jian_href_list = _collect_attachments(content, href)
                        elif 'display' in href:
                            continue
                        else:
                            # Not an .shtml page: store the link itself as
                            # the attachment.
                            content = ''
                            pub_hao = ''
                            fu_jian_href_list = [href]
                        result_dict = {
                            '标题': title,
                            '来源': '',
                            '号': pub_hao,
                            '内容': str(content),
                            '附件网址': fu_jian_href_list,
                            '发布时间': '',
                            '网址': href,
                            '归属': '云南省国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num = num + 1
                    except Exception as e:
                        log.error(f'yun_nan2: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'yun_nan2: failed to process listing page {url}: {e}')
            finally:
                if res is not None:
                    res.close()
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    yun_nan1()
    yun_nan2()


# 重庆
def chong_qing():
    """Crawl the Chongqing SASAC policy listing and save every new document.

    Requests ``index.html`` and ``index_1.html`` .. ``index_3.html`` under
    http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/, follows every link
    inside ``.zsj-fr-main``, skips URLs already stored in ``db_storage`` and
    passes each parsed record to ``save_data``.

    The original notes list the sub-sections
    http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/xzgfxwj/ (4 pages) and
    http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/qtwj/ (2 pages).

    The crawl is best effort: failures are logged and the affected page or
    document is skipped.
    """
    from urllib.parse import urljoin

    # Substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def _is_attachment(link):
        """Return True when *link* contains one of the attachment extensions."""
        return any(ext in link for ext in attachment_exts)

    num = 0
    start_time = time.time()
    for page in range(0, 4):
        if page == 0:
            url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index.html'
        else:
            url = 'http://gzw.cq.gov.cn/zwgk_191/fdzdgknr/zcwj/zcwj/index_{}.html'.format(page)
        try:
            resp_text = requests.get(url=url, headers=headers, verify=False).content
            doc_resp = pq(resp_text)
            for doc_item in doc_resp('.zsj-fr-main').items():
                for title_item in doc_item('a').items():
                    title = title_item.text().strip()
                    raw_href = title_item('a').attr('href')
                    if not raw_href:
                        # Anchor without href: skip it instead of failing the
                        # whole listing page.
                        continue
                    if '../' in raw_href:
                        # Link into a sibling section of .../zcwj/.
                        href = url.split('zcwj/index')[0] + raw_href.replace('../', '')
                    else:
                        href = url.split('index')[0] + raw_href.replace('./', '')
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        href_text = requests.get(url=href, headers=headers, verify=False).content
                        doc_href = pq(href_text)
                        try:
                            # Layout with a metadata table (.zwxl-table).
                            pub_result = doc_href('.zwxl-table').text().replace(' ', '')
                            pub_time = pub_result.split('[发布日期]')[1].strip() + ' 00:00:00'
                            pub_hao = pub_result.split('[发文字号]')[1].split('[主题分类]')[0].strip()
                            pub_source = pub_result.split('[发布机构]')[1].split('[成文日期]')[0].strip()
                            body = doc_href('.view.TRS_UEDITOR.trs_paper_default.trs_word')
                            content = str(body.children())
                            # Attachment links are read from the element that
                            # follows the body.
                            fu_jian_result = re.findall('href="(.*?)"', str(body.next()))
                        except Exception:
                            # No metadata table: fall back to the plain
                            # content container without metadata.
                            pub_source = ''
                            pub_time = ''
                            pub_hao = ''
                            content = str(doc_href('.zwxl-content').children())
                            fu_jian_result = re.findall('href="(.*?)"', str(content))
                        # Resolve attachment links against the detail page.
                        # The previous code removed every '.' from the link,
                        # which also destroyed the file extension.
                        fu_jian_href_list = [
                            urljoin(href, link) for link in fu_jian_result if _is_attachment(link)
                        ]
                        if content == '':
                            # Second fallback: the whole article minus share
                            # buttons and scripts; links are kept as found.
                            content = str(doc_href('.zwxl-article').remove('#div_div').remove('.bdsharebuttonbox').remove(
                                'script').children()).strip()
                            fu_jian_href_list = [
                                link for link in re.findall('href="(.*?)"', content) if _is_attachment(link)
                            ]
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '重庆市国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num += 1
                    except Exception as e:
                        log.error(f'chong_qing: failed to process {href}: {e}')
        except Exception as e:
            log.error(f'chong_qing: failed to process listing page {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 天津
def tian_jin():
    """Crawl three Tianjin SASAC policy listings and save every new document.

    Runs, in order, ``tian_jin1`` (``zcwj/sjwj``), ``tian_jin2``
    (``zcwj/wjwj``) and ``tian_jin3`` (``zcwj/gjjwj``).  Each skips URLs
    already stored in ``db_storage`` and passes parsed records to
    ``save_data``.  The crawl is best effort: failures are logged and the
    affected page or document is skipped.
    """
    from urllib.parse import urljoin

    # Substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    # Selector of the <ul> holding the entries on every listing page.
    listing_sel = '#content > div.mainContent > div > div.mBd > ul'

    def _is_attachment(link):
        """Return True when *link* contains one of the attachment extensions."""
        return any(ext in link for ext in attachment_exts)

    def tian_jin1():
        """Crawl the two listing pages of ZWGK1142/zcwj/sjwj."""
        num = 0
        start_time = time.time()
        # Note: this mutates the module-level ``headers`` dict, so the value
        # also applies to requests made after this function.
        headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
        for page in range(0, 2):
            if page == 0:
                url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/'
            else:
                url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/index_{page}.html'
            try:
                req = requests.get(url=url, headers=headers, verify=False)
                # Decode the raw bytes as UTF-8 directly; the previous
                # text -> ISO-8859-1 -> UTF-8 round trip raised whenever
                # requests had picked an encoding other than ISO-8859-1.
                soup = BeautifulSoup(req.content.decode('utf-8'), 'html.parser')
                li_list = soup.select(listing_sel)[0].find_all('li')
                for li in li_list:
                    anchor = li.find('a')
                    if anchor is None:
                        continue
                    title = str(anchor.text).replace('\n', '').strip()
                    i_href = str(anchor.get('href'))
                    if 'ZTZL' in i_href:
                        href = i_href.replace('../../../', 'https://sasac.tj.gov.cn/')
                    elif './' in i_href:
                        href = i_href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/sjwj/')
                    else:
                        href = i_href
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
                        doc_href = pq(href_text)
                        pub_result = doc_href('.property span').text()
                        if pub_result:
                            # Layout with a '.property' line holding source
                            # and publish time.
                            pub_time = pub_result.split('发布时间：')[1].strip() + ':00'
                            pub_source = pub_result.split('发布时间：')[0].split('文章来源：')[1].strip()
                            pub_hao = ''
                            content = str(doc_href('#zoom').children()).replace('<!-- 正文 -->', '').replace(
                                '<!-- 附件 -->', '').strip()
                        else:
                            # Layout with '.sx-con' cells; fields are taken
                            # by position (6: time, 2: source, 3: number).
                            cells = list(doc_href('.sx-con').items())
                            pub_time = str(cells[6]).split('sx-con">')[1].split('</div>')[0]
                            pub_source = str(cells[2]).split('sx-con">')[1].split('</div>')[0]
                            pub_hao = str(cells[3]).split('sx-con">')[1].split('</div>')[0]
                            content = str(doc_href('.article_content'))
                        # Resolve attachment links against the detail page.
                        # The previous code removed every '.' from the link,
                        # which also destroyed the file extension.
                        fu_jian_href_list = [
                            urljoin(href, link)
                            for link in re.findall('href="(.*?)"', content)
                            if _is_attachment(link)
                        ]
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '天津市国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num += 1
                    except Exception as e:
                        log.error(f'tian_jin1: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'tian_jin1: failed to process listing page {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def tian_jin2():
        """Crawl ZWGK1142/zcwj/wjwj: index.html plus index_1..index_4.

        Original note: http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html  4
        """
        num = 0
        start_time = time.time()
        # Note: mutates the module-level ``headers`` dict (see tian_jin1).
        headers['Accept-Language'] = 'zh-CN,zh;q=0.9'
        for page in range(0, 5):
            if page == 0:
                url = 'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index.html'
            else:
                url = f'http://sasac.tj.gov.cn/ZWGK1142/zcwj/wjwj/index_{page}.html'
            try:
                req = requests.get(url=url, headers=headers, verify=False)
                soup = BeautifulSoup(req.content.decode('utf-8'), 'html.parser')
                li_list = soup.select(listing_sel)[0].find_all('li')
                for li in li_list:
                    anchor = li.find('a')
                    if anchor is None:
                        continue
                    title = str(anchor.text).replace('\n', '').strip()
                    href = str(anchor.get('href'))
                    if 'http:' in href:
                        # Absolute links are not handled by this crawler.
                        continue
                    href = url.split('index')[0] + href.replace('./', '')
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        href_text = requests.get(url=href, headers=headers, verify=False).content.decode('utf-8')
                        doc_href = pq(href_text)
                        pub_result = doc_href('.sx-item').text().replace(' ', '').replace('　', '')
                        # The issue date is cut out of the raw markup.
                        pub_time = str(doc_href).split('发文日期：<p>')[1].split('</p></li>')[0] + ':00'
                        pub_source = pub_result.split('发布机构：')[1].split('发文字号：')[0].split(' ')[0].strip()
                        try:
                            pub_hao = pub_result.split('发文字号：')[1].split('主题：')[0].strip()
                        except Exception:
                            pub_hao = ''
                        # Pad once more when the value is still shorter than
                        # 'YYYY-MM-DD HH:MM:SS' (19 characters).
                        if len(pub_time) < 19:
                            pub_time = pub_time + ':00'
                        content = str(doc_href('.article_content.xl-zw-info').children()).strip()
                        base_url = href.split('/t')[0]
                        fu_jian_href_list = [
                            base_url + '/' + link.replace('./', '')
                            for link in re.findall('href="(.*?)"', content)
                            if _is_attachment(link)
                        ]
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '天津市国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num += 1
                    except Exception as e:
                        log.error(f'tian_jin2: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'tian_jin2: failed to process listing page {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def tian_jin3():
        """Crawl ZWGK1142/zcwj/gjjwj: index.html and index_1.html.

        Entries link either to www.gov.cn or to sasac.tj.gov.cn; each host
        has its own page layout.  Links to any other host are skipped.
        """
        num = 0
        start_time = time.time()
        for page in range(1, 3):
            if page == 1:
                url = 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index.html'
            else:
                # Second page is index_1.html.
                url = f'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/index_{page - 1}.html'
            try:
                # headers must be passed by keyword: the second positional
                # argument of requests.get is `params`.
                req = requests.get(url, headers=headers, verify=False)
                soup = BeautifulSoup(req.content.decode('utf-8'), 'html.parser')
                li_list = soup.select(listing_sel)[0].find_all('li')
                for li in li_list:
                    anchor = li.find('a')
                    if anchor is None:
                        continue
                    title = str(anchor.text).replace('\n', '').strip()
                    href = str(anchor.get('href'))
                    # The publish time is taken verbatim from the listing.
                    date_div = li.find('div', attrs={'class': 'other'})
                    pub_time = date_div.text if date_div is not None else ''
                    if 'http' not in href:
                        if '../../../' in href:
                            href = href.replace('../../../', 'https://sasac.tj.gov.cn/')
                        href = href.replace('./', 'https://sasac.tj.gov.cn/ZWGK1142/zcwj/gjjwj/')
                    if db_storage.find_one({'网址': href}):
                        continue
                    try:
                        if "www.gov.cn" in href:
                            res = requests.get(url=href, headers=headers)
                            detail = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
                            info_rows = detail.find('table', attrs={'class': 'bd1'}).findAll('tr')
                            pub_hao = info_rows[3].findAll('td')[1].text
                            pub_source = info_rows[1].findAll('td')[1].text
                            content = str(detail.find('table', attrs={'class': 'pages_content'})).replace(
                                "扫一扫在手机打开当前页", '')
                        elif "sasac.tj.gov.cn" in href:
                            res = requests.get(href, headers=headers)
                            detail = BeautifulSoup(res.content.decode('utf-8'), 'html.parser')
                            subtitle = detail.find('div', attrs={'class': 'common-content-subTitle'})
                            pub_hao = str(subtitle.text).strip() if subtitle is not None else ''
                            # Keep the subtitle only when it contains '号'.
                            if '号' not in pub_hao:
                                pub_hao = ''
                            pub_source = detail.find('div', attrs={'class': 'property'}).find('span').text.replace(
                                '文章来源： ', '')
                            content = str(detail.find('div', attrs={'class': 'TRS_UEDITOR'}))
                        else:
                            # Unknown host: no parser.  Previously this fell
                            # through with fields left over from the previous
                            # document (or undefined).
                            log.error(f'tian_jin3: unsupported link skipped: {href}')
                            continue
                        # Attachment links are stored exactly as they appear.
                        fu_jian_href_list = [
                            link for link in re.findall('href="(.*?)"', content) if _is_attachment(link)
                        ]
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '天津市国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num += 1
                    except Exception as e:
                        log.error(f'tian_jin3: failed to process {href}: {e}')
            except Exception as e:
                log.error(f'tian_jin3: failed to process listing page {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    tian_jin1()
    tian_jin2()
    tian_jin3()


# 新疆
def xin_jiang():
    """Crawl Xinjiang policy documents and pass each new article to ``save_data``.

    Two crawlers run back to back:

    * ``xin_jiang1`` - list pages under gzw.xinjiang.gov.cn/gzw/zcwj
      (saved with 归属 '新疆维吾尔自治区国资委').
    * ``xin_jiang_jsbt`` - list pages under gyzc.xjbt.gov.cn/xxgk/zcfg
      (saved with 归属 '新疆生产建设兵团国资委').

    Articles whose URL already exists in ``db_storage`` are skipped.  A failure
    on one list page or one article is logged and the crawl moves on.
    """
    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def xin_jiang1():
        """Crawl list pages 1-9 of gzw.xinjiang.gov.cn/gzw/zcwj."""
        num = 0
        start_time = time.time()
        for page in range(1, 10):
            if page == 1:
                url = 'http://gzw.xinjiang.gov.cn/gzw/zcwj/list_tj.shtml'
            else:
                url = f'http://gzw.xinjiang.gov.cn/gzw/zcwj/list_tj_{page}.shtml'
            try:
                list_bytes = requests.get(url=url, headers=headers, verify=False).content
                doc_items = pq(list_bytes)('.list.pt20 li').items()
                for doc_item in doc_items:
                    href = None
                    try:
                        title = doc_item('a').attr('title').strip()
                        href = 'http://gzw.xinjiang.gov.cn' + doc_item('a').attr('href')
                        if '/gzw/zcwj/' not in href:
                            continue
                        if db_storage.find_one({'网址': href}):
                            continue
                        href_resp = requests.get(url=href, headers=headers, verify=False)
                        # Decode the raw bytes directly; re-encoding ``.text`` as
                        # ISO-8859-1 only works when requests guessed that charset.
                        href_text = href_resp.content.decode('utf-8')
                        doc_href = BeautifulSoup(href_text, 'html.parser')
                        pub_time = doc_href.find('span', attrs={'class': 'date'}).text.replace(
                            '日期：', '').replace('∶', ':') + ':00'
                        pub_time = pub_time.strip()
                        pub_source = doc_href.find('span', attrs={'class': 'from'}).text.replace(
                            '来源：', '').strip()
                        content = str(doc_href.select('#NewsContent')[0])
                        fu_jian_href_list = []
                        # Attachments are only looked for when the body mentions a
                        # '.gif' (presumably the attachment icon - TODO confirm).
                        if '.gif' in content:
                            for fu_jian_re in re.findall('href="(.*?)"', content):
                                if any(ext in fu_jian_re for ext in attachment_exts):
                                    # Rebuild the link under the article's own
                                    # "<article>/files/..." directory.
                                    fu_jian_href = href.replace('.shtml', '') + '/files' + \
                                        fu_jian_re.split('/files')[1]
                                    fu_jian_href_list.append(fu_jian_href)
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': '',
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '新疆维吾尔自治区国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num += 1
                    except Exception as e:
                        log.error(f'xin_jiang1: article failed {href}: {e}')
            except Exception as e:
                log.error(f'xin_jiang1: list page failed {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def xin_jiang_jsbt():
        """Crawl list pages 1-5 of gyzc.xjbt.gov.cn/xxgk/zcfg."""
        num = 0
        start_time = time.time()
        for page in range(1, 6):
            if page == 1:
                url = 'http://gyzc.xjbt.gov.cn/xxgk/zcfg/'
            else:
                url = f'http://gyzc.xjbt.gov.cn/xxgk/zcfg/index_{page}.shtml'
            try:
                list_resp = requests.get(url=url, headers=headers, verify=False)
                doc_items = pq(list_resp.content)('.article').items()
                for doc_item in doc_items:
                    href = None
                    try:
                        title = doc_item('a').text().strip()
                        pub_time = doc_item('.time.pull-right').text().strip() + ' 00:00:00'
                        href = doc_item('a').attr('href')
                        if 'http' not in href:
                            href = 'http://gyzc.xjbt.gov.cn' + href
                        if db_storage.find_one({'网址': href}):
                            continue
                        href_res = requests.get(url=href, headers=headers, verify=False)
                        href_res.encoding = href_res.apparent_encoding
                        soup = BeautifulSoup(href_res.text, 'html.parser')
                        pub_result = soup.find('div', attrs={'class': 'title_info'}).text
                        pub_source = pub_result.split('信息来源：')[1].split('编辑：')[0].strip()
                        content = str(soup.find('div', attrs={'id': 'detail'}))
                        fu_jian_href_list = [
                            link for link in re.findall('href="(.*?)"', content)
                            if any(ext in link for ext in attachment_exts)
                        ]
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': '',
                            '内容': content,
                            '附件网址': fu_jian_href_list,
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '新疆生产建设兵团国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num += 1
                        href_res.close()
                    except Exception as e:
                        log.error(f'xin_jiang_jsbt: article failed {href}: {e}')
                list_resp.close()
            except Exception as e:
                log.error(f'xin_jiang_jsbt: list page failed {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    xin_jiang1()
    xin_jiang_jsbt()


# 山西
def shan_xi():
    """Crawl Shanxi SASAC policy documents and pass each new one to ``save_data``.

    Walks list pages 1-6 under gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj.  Rows whose
    URL already exists in ``db_storage`` are skipped.  A row that links
    straight to a PDF is saved with empty content and the PDF as its only
    attachment; otherwise the detail page is fetched and parsed.  Failures are
    logged and the crawl continues.
    """
    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    base_url = 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/'
    num = 0
    start_time = time.time()
    for page in range(1, 7):
        if page == 1:
            url = base_url
        else:
            url = f'{base_url}index_{page - 1}.shtml'
        try:
            # ``headers`` must be a keyword argument: the second positional
            # parameter of requests.get() is ``params`` (the query string).
            res = requests.get(url, headers=headers)
            # Decode the raw bytes directly instead of round-tripping ``.text``
            # through ISO-8859-1, which fails when a charset is declared.
            page_text = res.content.decode('utf-8')
            tree = etree.HTML(page_text)
            tr_list = tree.xpath(
                '/html/body/table[3]/tbody/tr/td[2]/table/tbody/tr[3]/td/table[2]/tbody/tr[3]/td/form/table/tbody/tr')
            for tr in tr_list:
                href = tr.xpath('./td[1]/a/@href')
                if not href:
                    continue
                href = href[0].replace('../../', base_url).replace('./', base_url)
                try:
                    title = tr.xpath('./td[1]/a/span//text()')[0]
                    pub_time = tr.xpath('./td[2]/span/text()')[0]
                    if db_storage.find_one({'网址': href}):
                        continue
                    if ".pdf" in href:
                        content = ''
                        pub_hao = ''
                        # NOTE(review): the list-page date is discarded for
                        # direct PDF links, as in the original - confirm intent.
                        pub_time = ''
                        pub_source = ''
                        fu_jian_href_list = [href]
                    else:
                        detail_res = requests.get(href, headers=headers)
                        detail_text = detail_res.content.decode('utf-8')
                        detail = BeautifulSoup(detail_text, 'html.parser')
                        content = str(detail.select('#vsb_content')[0])
                        pub_hao = ''
                        pub_source = ''
                        fu_jian_href_list = [
                            link for link in re.findall('href="(.*?)"', content)
                            if any(ext in link for ext in attachment_exts)
                        ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '山西省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num += 1
                except Exception as e:
                    log.error(f'shan_xi: article failed {href}: {e}')
        except Exception as e:
            log.error(f'shan_xi: list page failed {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 辽宁
def liao_ning():
    """Crawl Liaoning SASAC policy documents and pass each new one to ``save_data``.

    Walks list pages 1-2 of gzw.ln.gov.cn/gzw/xxgk/zc/zcfb.  Articles whose
    URL already exists in ``db_storage`` are skipped.  Two detail-page layouts
    are handled: the ``govxlTText`` layout (title, source and publish date)
    and, when that fails, the ``zfwj_detail`` layout (title and document
    number only).  Failures are logged and the crawl continues.
    """
    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    for page in range(1, 3):
        url = f'https://gzw.ln.gov.cn/gzw/xxgk/zc/zcfb/aa251549-{page}.shtml'
        try:
            list_resp = requests.get(url=url, headers=headers, verify=False)
            list_resp.encoding = list_resp.apparent_encoding
            doc_resp = BeautifulSoup(list_resp.text, 'html.parser')
            doc_items = doc_resp.select(
                '#aa25154996104f57858a48e0b1aecca9 > div:nth-of-type(2) > div.tablist-show > div.tab-list-page')[0]
            for li in doc_items.select('li'):
                href = None
                try:
                    href = str(li.select('a')[0].get('href'))
                    # A link without any scheme is treated as site-relative.
                    if 'http' not in href:
                        href = 'https://gzw.ln.gov.cn/' + href
                    if db_storage.find_one({'网址': href}):
                        continue
                    detail_resp = requests.get(url=href, headers=headers, verify=False)
                    detail_resp.encoding = detail_resp.apparent_encoding
                    doc_href = BeautifulSoup(detail_resp.text, 'html.parser')
                    try:
                        # Primary layout.  ``attrs`` must be a dict; a set
                        # literal only matched by accident.
                        title = doc_href.find('p', attrs={'class': 'govxlTText'}).text.strip()
                        pub_result = doc_href.find('p', attrs={'class': 'govxlTText2'}).text.strip()
                        pub_time = pub_result.split('发布时间：')[1].strip().replace('年', '-').replace(
                            '月', '-').replace('日', '') + ' 00:00:00'
                        pub_source = pub_result.split('文章来源：')[1].split('发布时间：')[0].strip()
                        pub_hao = ''
                        content = doc_href.find('div', attrs={'class': 'TRS_Editor'})
                    except Exception:
                        # Fallback layout: government-document style page.
                        title = str(doc_href.find('h1', attrs={'class': 'title'}).text).strip()
                        pub_hao = str(doc_href.find('p', attrs={'class': 'wjh'}).text).strip()
                        content = doc_href.find('div', attrs={'class': 'zfwj_detail'})
                        pub_time = ''
                        pub_source = ''
                    fu_jian_href_list = [
                        link for link in re.findall('href="(.*?)"', str(content))
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': str(content),
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '辽宁省国资委',
                    }
                    save_data(result_dict)
                    print(title)
                    num += 1
                except Exception as e:
                    log.error(f'liao_ning: article failed {href}: {e}')
        except Exception as e:
            log.error(f'liao_ning: list page failed {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 黑龙江
def hei_long_jiang():
    """Crawl Heilongjiang SASAC policy documents and pass each new one to ``save_data``.

    Queries pages 1-2 of the site's JSON search endpoint.  Title, URL, publish
    time, document number and HTML body come from the JSON row; the detail
    page is fetched only to read the source (``.ly > b``).  Rows whose URL
    already exists in ``db_storage`` are skipped.  Each row is processed
    independently, so one malformed row no longer aborts the rest of its page.
    """
    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    for page in range(1, 3):
        url = f'http://gzw.hlj.gov.cn/common/search/a4e4f3e94596456db749bfb0f7937cc7?_isAgg=true&_isJson=true&_pageSize=10&_template=index&_rangeTimeGte=&_channelName=&page={page}'
        try:
            web = requests.get(url=url, headers=headers, verify=False)
            data = web.json()['data']
            # Take at most ``rows`` entries; slicing avoids the IndexError that
            # indexing by range(rows) raised when fewer results were returned.
            results = data['results'][:int(data['rows'])]
        except Exception as e:
            log.error(f'hei_long_jiang: list page failed {url}: {e}')
            continue
        for result in results:
            href = None
            try:
                title = result['title']
                href = 'http://gzw.hlj.gov.cn' + result['url']
                pub_time = result['publishedTimeStr']
                # Only the first entry of the second metadata group is checked
                # for the document number.
                meta = result['domainMetaList'][1]['resultList'][0]
                pub_hao = meta['value'] if meta['name'] == '文号' else ''
                if db_storage.find_one({'网址': href}):
                    continue
                content = result['contentHtml']
                detail_resp = requests.get(url=href, headers=headers, verify=False)
                detail_resp.encoding = detail_resp.apparent_encoding
                doc_href = BeautifulSoup(detail_resp.text, 'html.parser')
                source_tag = doc_href.find(class_='ly')
                if source_tag:
                    pub_source = source_tag.find('b').text
                else:
                    pub_source = ''
                fu_jian_href_list = [
                    link for link in re.findall('href="(.*?)"', content)
                    if any(ext in link for ext in attachment_exts)
                ]
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '黑龙江省国资委',
                }
                save_data(result_dict)
                print(title)
                num += 1
            except Exception as e:
                log.error(f'hei_long_jiang: row failed {href}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 江苏
def jiang_su():
    """Crawl Jiangsu SASAC policy documents and pass each new one to ``save_data``.

    Posts to the site's ``dataproxy.jsp`` endpoint twice, requesting records
    1-45 and then 46-90, and extracts each ``<record>`` CDATA fragment.
    Articles whose URL already exists in ``db_storage`` are skipped.  Two
    detail-page layouts are handled: the ``xlt_table`` metadata table and,
    when that fails, the ``small-title`` header.  Failures are logged and the
    crawl continues.
    """
    # Function-scope import: keeps this block self-contained.
    from urllib.parse import urljoin

    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    pagestart = 1
    pageend = 45
    for _ in range(1, 3):
        url = f"http://jsgzw.jiangsu.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={pagestart}&endrecord={pageend}&perpage=15"
        # Advance the record window for the next request.
        pagestart = pageend + 1
        pageend = pageend + 45
        payload = "col=1&appid=1&webid=39&path=%2F&columnid=85683&sourceContentType=1&unitid=369983&webname=%E6%B1%9F%E8%8B%8F%E7%9C%81%E5%9B%BD%E8%B5%84%E5%A7%94&permissiontype=0"
        header = {
            'Connection': 'keep-alive',
            'Accept': 'application/xml, text/xml, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'http://jsgzw.jiangsu.gov.cn',
            'Referer': 'http://jsgzw.jiangsu.gov.cn/col/col61490/index.html?uid=247686&pageNum=4',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'JSESSIONID=ADB520E83E1FC10429D961634BAD303D; __jsluid_h=02c2c950abb71f547a79da79719246aa; _gscu_210493472=24936291qq5dvl18; _gscbrs_210493472=1; yunsuo_session_verify=60cc00825d4e2dd3dee278a301f60f1e; _gscs_210493472=24936291p77pyu18|pv:3'
        }
        try:
            resp_text = requests.request("POST", url, headers=header, data=payload).text
            # Raw string: ``\[`` in a normal literal is an invalid escape sequence.
            li_list = re.findall(r'CDATA\[(.*?)\]\]></record>', str(resp_text))
            for li in li_list:
                href = None
                try:
                    a = BeautifulSoup(li, 'lxml').find('a')
                    # Built by plain concatenation on purpose: this exact string
                    # is the key used for the ``db_storage`` lookup below.
                    href = 'https://jsgzw.jiangsu.gov.cn/' + a['href']
                    title = a.text
                    if db_storage.find_one({'网址': href}):
                        continue
                    detail_resp = requests.get(url=href, headers=headers, verify=False)
                    detail_resp.encoding = detail_resp.apparent_encoding
                    doc_href = BeautifulSoup(detail_resp.text, 'html.parser')
                    try:
                        # Primary layout: metadata table; values are cut out of
                        # the serialized <td ... "309"> cells.
                        result = doc_href.find(class_='xlt_table').find_all('td')
                        pub_source = str(result[5]).split('"309">')[1].split('</td>')[0]
                        pub_time = str(result[3]).split('"309">')[1].split('</td>')[0]
                        pub_hao = str(result[7]).split('"309">')[1].split('</td>')[0]
                        content = str(doc_href.find(class_='con'))
                    except Exception:
                        # Fallback layout: only the publish date is available.
                        content = str(doc_href.find('div', attrs={'class': 'con'}))
                        small_title = str(doc_href.find('div', attrs={'class': 'small-title'}).text)
                        pub_time = small_title.split('发布日期：')[1].split('浏览次数：')[0].split(' ')[0].strip()
                        pub_hao = ''
                        pub_source = ''
                    # urljoin equals the old concatenation for root-relative
                    # links and leaves absolute links intact.
                    fu_jian_href_list = [
                        urljoin('http://jsgzw.jiangsu.gov.cn', link)
                        for link in re.findall('href="(.*?)"', content)
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '江苏省国资委',
                    }
                    save_data(result_dict)
                    print(title)
                    num += 1
                except Exception as e:
                    log.error(f'jiang_su: article failed {href}: {e}')
        except Exception as e:
            log.error(f'jiang_su: list request failed {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 安徽
def an_hui():
    """Crawl Anhui SASAC policy documents and pass each new one to ``save_data``.

    Two crawlers run back to back against the gzw.ah.gov.cn
    ``publicInfoList`` endpoint: ``an_hui1`` (catId 6717051, pages 1-3) and
    ``an_hui2`` (catIds 43793891 and 43793901, pages 1-24).  Articles whose
    URL already exists in ``db_storage`` are skipped.  Failures are logged
    and the crawl continues.
    """
    # Function-scope import: keeps this block self-contained.
    from urllib.parse import urljoin

    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def collect_attachments(html):
        """Return absolute URLs of attachment links found in *html*."""
        # urljoin equals the old concatenation for root-relative links and
        # leaves absolute links intact.
        return [
            urljoin('http://gzw.ah.gov.cn', link)
            for link in re.findall('href="(.*?)"', html)
            if any(ext in link for ext in attachment_exts)
        ]

    def an_hui1():
        """Crawl pages 1-3 of the catId=6717051 listing."""
        num = 0
        start_time = time.time()
        for page in range(1, 4):
            url = f'http://gzw.ah.gov.cn/site/label/8888?IsAjax=1&dataType=html&_=0.4981381464472001&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=&catId=6717051&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
            try:
                resp_text = requests.get(url=url, headers=headers, verify=False).text
                doc_items = pq(resp_text)('.info').items()
                for doc_item in doc_items:
                    href = None
                    try:
                        title = doc_item('a').attr('title').strip()
                        href = doc_item('a').attr('href')
                        if db_storage.find_one({'网址': href}):
                            continue
                        detail_resp = requests.get(url=href, headers=headers, verify=False)
                        detail_resp.encoding = detail_resp.apparent_encoding
                        soup = BeautifulSoup(detail_resp.text, 'html.parser')
                        try:
                            # Primary layout: index table with issuer and number.
                            pub_result = soup.find('div', attrs={'class': 'div_table_suoyin'}).text
                            pub_time = soup.find('span', attrs={'class': 'fbsj'}).text.replace(
                                '发布日期：', '') + ':00'
                            pub_source = pub_result.split('发布机构：')[1].split('文号：')[0].strip()
                            pub_hao = pub_result.split('文号：')[1].split('名称：')[0].strip()
                            content = soup.find('div', attrs={'id': 'zoom'})
                        except Exception:
                            # Fallback layout: title, source and time all come
                            # from the ``zsy_cotitle`` header text.
                            content = soup.find('div', attrs={'class': 'zsy_content'})
                            header_text = str(content.find('div', attrs={'class': 'zsy_cotitle'}).text)
                            title = header_text.split('文章来源：')[0].replace('\n', '').replace(
                                '\r', '').strip()
                            pub_time = header_text.split('发布时间：')[1].strip()
                            pub_source = header_text.split('文章来源：')[1].split('发布时间：')[0].strip()
                            pub_hao = ''
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': str(content),
                            '附件网址': collect_attachments(str(content)),
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '安徽省国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num = num + 1
                    except Exception as e:
                        log.error(f'an_hui1: article failed {href}: {e}')
            except Exception as e:
                log.error(f'an_hui1: list page failed {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def an_hui2():
        """Crawl pages 1-24 of the catIds=43793891,43793901 listing."""
        num = 0
        start_time = time.time()
        for page in range(1, 25):
            url = f'http://gzw.ah.gov.cn/site/label/8888?_=0.5237800193505848&labelName=publicInfoList&siteId=6788071&pageSize=15&pageIndex={page}&action=list&isDate=false&dateFormat=yyyy%E5%B9%B4MM%E6%9C%88dd%E6%97%A5&length=15&organId=7031&type=4&catIds=43793891%2C43793901&catId=&cId=&result=&title=&fileNum=&keyWords=&file=%2Fxxgk%2FpublicInfoList_newest2020_zc'
            try:
                res = requests.get(url=url, headers=headers)
                res.encoding = res.apparent_encoding
                soup = BeautifulSoup(res.text, 'html.parser')
                tr_list = soup.find_all('tr', attrs={'class': 'xxgk_nav_con'})
                for tr in tr_list:
                    href = None
                    try:
                        link_tag = tr.find('td', attrs={'class': 'info'}).find('a')
                        title = link_tag.text
                        href = link_tag.get('href')
                        # NOTE(review): the 'fwrq' cell is stored as the document
                        # number, as in the original - confirm that is intended.
                        pub_hao = tr.find('td', attrs={'class': 'fwrq'}).text
                        pub_time = tr.find('td', attrs={'class': 'fbrq'}).text
                        if db_storage.find_one({'网址': href}):
                            continue
                        href_res = requests.get(url=href, headers=headers, verify=False)
                        href_res.encoding = href_res.apparent_encoding
                        doc_href = BeautifulSoup(href_res.text, 'html.parser')
                        try:
                            pub_source = str(doc_href.find(class_='res').text).split('：')[1]
                            content = doc_href.find(class_='gkwz_content')
                        except Exception:
                            # Fallback layout: source is in the ``zsy_cotitle`` header.
                            content = doc_href.find('div', attrs={'class': 'zsy_content'})
                            header_text = str(content.find('div', attrs={'class': 'zsy_cotitle'}).text)
                            pub_source = header_text.split('文章来源：')[1].split('发布时间：')[0].strip()
                        result_dict = {
                            '标题': title,
                            '来源': pub_source,
                            '号': pub_hao,
                            '内容': str(content),
                            '附件网址': collect_attachments(str(content)),
                            '发布时间': pub_time,
                            '网址': href,
                            '归属': '安徽省国资委',
                        }
                        print(title)
                        save_data(result_dict)
                        num = num + 1
                        href_res.close()
                    except Exception as e:
                        log.error(f'an_hui2: article failed {href}: {e}')
                res.close()
            except Exception as e:
                log.error(f'an_hui2: list page failed {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    an_hui1()
    an_hui2()


# 江西
def jiang_xi():
    """Crawl Jiangxi SASAC policy documents and pass each new one to ``save_data``.

    Posts to the site's ``dataproxy.jsp`` endpoint twice, requesting records
    1-60 and then 61-120 (a third window, 121-164, was listed in the original
    notes but is not requested by this loop).  Every ``href='...'`` in the
    response is treated as an article URL; URLs already in ``db_storage`` are
    skipped.  Metadata is cut out of the text of the ``xxgk-quote`` block.
    Failures are logged and the crawl continues.
    """
    # Function-scope import: keeps this block self-contained.
    from urllib.parse import urljoin

    # Case-sensitive substrings that mark a link as a downloadable attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    startrecord = 1
    endrecord = 60
    for _ in range(1, 3):
        url = f"http://gzw.jiangxi.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={startrecord}&endrecord={endrecord}&perpage=20"
        # Advance the record window for the next request.
        startrecord = endrecord + 1
        endrecord = endrecord + 60
        payload = "col=1&webid=175&path=http%3A%2F%2Fgzw.jiangxi.gov.cn%2F&columnid=22977&sourceContentType=1&unitid=402016&webname=%E6%B1%9F%E8%A5%BF%E7%9C%81%E5%9B%BD%E6%9C%89%E8%B5%84%E4%BA%A7%E7%9B%91%E7%9D%A3%E7%AE%A1%E7%90%86%E5%A7%94%E5%91%98%E4%BC%9A&permissiontype=0"
        header = {
            'Connection': 'keep-alive',
            'Accept': 'application/xml, text/xml, */*; q=0.01',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Origin': 'http://gzw.jiangxi.gov.cn',
            'Referer': 'http://gzw.jiangxi.gov.cn/col/col22977/index.html?uid=402016&pageNum=9',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'JSESSIONID=F601A052571881210819664F5BD38015; JSESSIONID=6E54DB27D82E844B825DD675AE19E399'
        }
        try:
            resp_text = requests.request("POST", url, headers=header, data=payload).text
            href_list = re.findall("href='(.*?)'", resp_text)
            for href in href_list:
                try:
                    if db_storage.find_one({'网址': href}):
                        continue
                    href_res = requests.get(url=href, headers=headers, verify=False)
                    href_res.encoding = href_res.apparent_encoding
                    soup = BeautifulSoup(href_res.text, 'html.parser')
                    pub_result = str(soup.find('div', attrs={'class': 'xxgk-quote'}).text)
                    # NOTE(review): the '？' runs in the markers below look like
                    # mis-encoded padding characters; they are kept byte-for-byte
                    # - verify them against the live page text.
                    title = pub_result.split('标？？？？？？题: ')[1].split('有？？效？？性: ')[0].strip()
                    pub_source = pub_result.split('发文机关:')[1].split('文？？？？？？号:')[0].strip()
                    pub_hao = pub_result.split('文？？？？？？号:')[1].split('主题分类: ')[0].strip()
                    pub_time = pub_result.split('成文日期:')[1].split('标？？？？？？题: ')[0].strip()
                    content = str(soup.select('#c')[0])
                    # 'amp;' is dropped so '&amp;' in the serialized HTML becomes
                    # '&'.  urljoin equals the old concatenation for
                    # root-relative links and leaves absolute links intact.
                    fu_jian_href_list = [
                        urljoin('http://gzw.jiangxi.gov.cn', link.replace('amp;', ''))
                        for link in re.findall('href="(.*?)"', content)
                        if any(ext in link for ext in attachment_exts)
                    ]
                    result_dict = {
                        '标题': title,
                        '来源': pub_source,
                        '号': pub_hao,
                        '内容': content,
                        '附件网址': fu_jian_href_list,
                        '发布时间': pub_time,
                        '网址': href,
                        '归属': '江西省国资委',
                    }
                    print(title)
                    save_data(result_dict)
                    num = num + 1
                except Exception as e:
                    log.error(f'jiang_xi: article failed {href}: {e}')
        except Exception as e:
            log.error(f'jiang_xi: list request failed {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 河南
def he_nan():
    """Crawl the Henan SASAC policy listing and pass every new document to ``save_data``.

    Visits the seven listing pages under ``/xxgk/fdzdgknr/zcfg/``, skips detail
    URLs already present in ``db_storage``, parses each remaining detail page
    and forwards the extracted record.  A failure is logged and only the
    affected listing page or document is skipped.  The number of forwarded
    documents and the elapsed time are printed at the end.
    """
    # Substrings that mark a link inside the article body as an attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    for page in range(0, 7):
        if page == 0:
            url = 'http://gzw.henan.gov.cn/xxgk/fdzdgknr/zcfg/index.html'
        else:
            url = f'http://gzw.henan.gov.cn/xxgk/fdzdgknr/zcfg/index_{page}.html'
        try:
            resp = requests.get(url=url, headers=headers, verify=False)
            try:
                doc_items = list(pq(resp.content)('.mt15.list-box li').items())
            finally:
                # Release the connection even when parsing the listing fails.
                resp.close()
        except Exception as e:
            log.error(f'he_nan: failed to load listing page {url}: {e}')
            continue
        for doc_item in doc_items:
            href = doc_item('a').attr('href')
            if not href:
                # List entry without a link: nothing to fetch.
                continue
            # Each document is handled on its own so one broken page does not
            # discard the remaining entries of the listing.
            try:
                title = doc_item('a').text().strip()
                if db_storage.find_one({'网址': href}):
                    continue
                href_res = requests.get(url=href, headers=headers, verify=False)
                try:
                    href_res.encoding = href_res.apparent_encoding
                    soup = BeautifulSoup(href_res.text, 'html.parser')
                finally:
                    href_res.close()
                # The publish time is taken from #pubDate and the source from
                # #source (the two selectors were previously swapped).
                # NOTE(review): based on the element ids only - confirm against a live page.
                pub_time = soup.select('#pubDate')[0].text
                pub_source = soup.select('#source')[0].text
                content = str(soup.select('#content')[0])
                fu_jian_href_list = [
                    link for link in re.findall('href="(.*?)"', content)
                    if any(ext in link for ext in attachment_exts)
                ]
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': '',
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '河南省国资委',
                }
                print(title)
                save_data(result_dict)
                num += 1
            except Exception as e:
                log.error(f'he_nan: failed to process {href}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 湖南
def hu_nan():
    """Crawl the Hunan SASAC policy listing and pass every new document to ``save_data``.

    Visits ``index.html`` and ``index_2.html`` .. ``index_6.html`` of the
    ``zcfg`` column, skips detail URLs already present in ``db_storage``,
    parses each remaining detail page and forwards the extracted record.
    A failure is logged and only the affected listing page or document is
    skipped.  The number of forwarded documents and the elapsed time are
    printed at the end.
    """
    # Substrings that mark a link inside the article body as an attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    for page in range(1, 7):
        if page == 1:
            url = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index.html'
        else:
            url = f'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/index_{page}.html'
        try:
            resp_text = requests.get(url=url, headers=headers, verify=False).content
            doc_items = list(pq(resp_text)('.table tbody tr').items())
        except Exception as e:
            log.error(f'hu_nan: failed to load listing page {url}: {e}')
            continue
        for doc_item in doc_items:
            relative_href = doc_item('a').attr('href')
            if not relative_href:
                # Table row without a link; concatenating None would raise
                # and used to abort the rest of the page.
                continue
            href = 'http://gzw.hunan.gov.cn' + relative_href
            try:
                if db_storage.find_one({'网址': href}):
                    continue
                res = requests.get(url=href, headers=headers, verify=False)
                res.encoding = res.apparent_encoding
                soup = BeautifulSoup(res.text, 'html.parser')
                # The metadata block is read as plain text and cut apart at its labels.
                pub_result = str(soup.find('div', attrs={'class': 'information-zt-list fn-clear'}).text)
                pub_time = pub_result.split('发文日期：')[1].split('名称：')[0].strip() + ':00'
                title = pub_result.split('名称：')[1].split('主题分类：')[0].strip()
                pub_source = pub_result.split('发布机构: ')[1].split('if(')[0].strip()
                content = str(soup.find('div', attrs={'class': 'information-zt-show'}))
                # Attachment links are joined onto the directory part of the
                # detail URL (everything before the '/t...' file name).
                base_dir = href.split('/t')[0] + '/'
                fu_jian_href_list = [
                    base_dir + link
                    for link in re.findall('href="(.*?)"', content)
                    if any(ext in link for ext in attachment_exts)
                ]
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': '',
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '湖南省国资委',
                }
                print(title)
                save_data(result_dict)
                num = num + 1
            except Exception as e:
                log.error(f'hu_nan: failed to process {href}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 甘肃
def gan_su():
    """Crawl the Gansu SASAC disclosure columns with headless Chrome.

    Runs three passes in order, each with its own browser instance:

    * ``gan_su1`` - columns c115543-c115551 and c115554 (single page layout).
    * ``gan_su2`` - column c115552 (several page layouts, tried in order).
    * ``gan_su3`` - column c115553 (paginated listing, several page layouts).

    Detail URLs already present in ``db_storage`` are skipped; every other
    document is parsed and passed to ``save_data``.  A failing document is
    logged and skipped, and the browser is always shut down.  Each pass
    prints its own document count and elapsed time.

    Raises:
        Whatever starting Chrome or reading ``./stealth.min.js`` raises; such
        an error is not caught here and the remaining passes do not run.
    """
    # Substrings that mark a link inside the article body as an attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def new_browser():
        """Start headless Chrome and register the stealth script for every new document."""
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_experimental_option(
            "excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('lang=zh-CN,zh,zh-TW,en-US,en')
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')
        # NOTE(review): chrome_options= / executable_path= are the legacy keyword
        # names - confirm they are still accepted by the installed Selenium version.
        bro = webdriver.Chrome(chrome_options=chrome_options, executable_path=r'D:/chrome/103/chromedriver.exe')
        try:
            with open('./stealth.min.js') as f:
                js = f.read()
            bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
                "source": js
            })
        except Exception:
            # Do not leave a Chrome process behind when the setup fails.
            bro.quit()
            raise
        return bro

    def open_listing(bro, url):
        """Load a listing page and scroll to its bottom."""
        bro.get(url)
        time.sleep(2)
        bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')

    def visible_hrefs(bro):
        """Return the detail links of the listing currently shown in the browser."""
        time.sleep(1)
        ul = bro.find_element(By.CLASS_NAME, 'UlTab')
        return [li.find_element(By.TAG_NAME, 'a').get_attribute('href')
                for li in ul.find_elements(By.TAG_NAME, 'li')]

    def parse_links_layout(bro, content_class):
        """Parse a page built from 'links_tit' / 'links_tab' plus a content element."""
        title = str(bro.find_element(By.CLASS_NAME, 'links_tit').text)
        content = str(bro.find_element(By.CLASS_NAME, content_class).get_attribute('innerHTML'))
        links_tab = str(bro.find_element(By.CLASS_NAME, 'links_tab').text)
        pub_source = links_tab.split('发布机构')[1].split('主题分类')[0].replace('：', '').strip()
        pub_hao = links_tab.split('文号')[1].split('浏览次数')[0].replace('：', '').strip()
        pub_time = links_tab.split('生成日期')[1].split('关键字')[0].replace('：', '').strip()
        return title, content, pub_source, pub_hao, pub_time

    def parse_detail(bro, parsers):
        """Return the result of the first parser that succeeds; re-raise the last error otherwise."""
        last_error = None
        for parser in parsers:
            try:
                return parser(bro)
            except Exception as e:
                last_error = e
        raise last_error

    def crawl_details(bro, hrefs, parsers):
        """Fetch, parse and forward every new detail page; return how many were forwarded."""
        saved = 0
        for href in hrefs:
            # Each document is handled on its own so one unparsable page does
            # not abort the rest of the column.
            try:
                if db_storage.find_one({'网址': href}):
                    continue
                bro.get(href)
                time.sleep(1)
                title, content, pub_source, pub_hao, pub_time = parse_detail(bro, parsers)
                fu_jian_href_list = [
                    link for link in re.findall('href="(.*?)"', content)
                    if any(ext in link for ext in attachment_exts)
                ]
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '甘肃省国资委',
                }
                print(title)
                save_data(result_dict)
                saved += 1
            except Exception as e:
                log.error(f'gan_su: failed to process {href}: {e}')
        return saved

    def gan_su1():
        """Crawl the ten single-page columns; every detail page uses the 'links_words' layout."""
        num = 0
        start_time = time.time()
        bro = new_browser()
        urls = ['http://gzw.gansu.gov.cn/gzw/c115543/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115544/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115545/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115546/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115547/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115548/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115549/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115550/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115551/xxgk_list.shtml',
                'http://gzw.gansu.gov.cn/gzw/c115554/xxgk_list.shtml']
        parsers = (lambda b: parse_links_layout(b, 'links_words'),)
        try:
            for url in urls:
                try:
                    open_listing(bro, url)
                    hrefs = visible_hrefs(bro)
                except Exception as e:
                    log.error(f'gan_su1: failed to load listing page {url}: {e}')
                    continue
                num += crawl_details(bro, hrefs, parsers)
        finally:
            bro.quit()
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def gan_su2():
        """Crawl column c115552, whose detail pages come in four different layouts."""

        def parse_table_layout(bro):
            # Metadata sits in one table cell; the body is in 'pages_content'.
            pub = str(bro.find_element(By.XPATH, '/html/body/div[4]/div/div[2]/div[1]/table/tbody/tr/td').text)
            title = pub.split('标　　题：')[1].split('发文字号：')[0].strip()
            pub_source = pub.split('发文机关：')[1].split('标　　题：')[0].strip()
            pub_hao = pub.split('发文字号：')[1].split('发布日期：')[0].strip()
            pub_time = pub.split('发布日期：')[1].strip()
            content = str(bro.find_element(By.CLASS_NAME, 'pages_content').get_attribute('innerHTML'))
            return title, content, pub_source, pub_hao, pub_time

        def parse_content_layout(bro):
            content_ = bro.find_element(By.CLASS_NAME, 'content')
            content = str(content_.get_attribute('innerHTML'))
            title = str(bro.find_element(By.TAG_NAME, 'h1').text).replace('\n', '')
            pages_date = str(content_.find_element(By.CLASS_NAME, 'pages-date').text)
            pub_time = pages_date.split(' 来源：')[0]
            pub_source = pages_date.split(' 来源：')[1].split('\n')[0].strip()
            return title, content, pub_source, '', pub_time

        def parse_padd_layout(bro):
            # click() returns None, so the former `if ....click():` guard was
            # never true. Click the pager only when it exists, then scroll.
            pagers = bro.find_elements(By.ID, 'pagination')
            if pagers:
                pagers[0].click()
                bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            content_ = bro.find_element(By.CLASS_NAME, 'padd-zyygwj')
            content = str(content_.get_attribute('innerHTML'))
            title = str(content_.find_element(By.TAG_NAME, 'h1').text).replace('\n', '')
            pages_date = str(content_.find_element(By.CLASS_NAME, 'pages-date').text)
            pub_time = pages_date.split(' 来源：')[0]
            pub_source = pages_date.split(' 来源：')[1].split('\n')[0].strip()
            return title, content, pub_source, '', pub_time

        num = 0
        start_time = time.time()
        bro = new_browser()
        url = 'http://gzw.gansu.gov.cn/gzw/c115552/xxgk_list.shtml'
        parsers = (parse_table_layout, parse_content_layout, parse_padd_layout,
                   lambda b: parse_links_layout(b, 'box_tab'))
        try:
            hrefs = []
            try:
                open_listing(bro, url)
                hrefs = visible_hrefs(bro)
            except Exception as e:
                log.error(f'gan_su2: failed to load listing page {url}: {e}')
            num = crawl_details(bro, hrefs, parsers)
        finally:
            bro.quit()
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def gan_su3():
        """Crawl the paginated column c115553, whose detail pages come in three layouts."""

        def parse_detail_layout(bro):
            content_ = bro.find_element(By.ID, 'detailContent')
            content = str(content_.get_attribute('innerHTML'))
            pub = str(bro.find_element(By.CLASS_NAME, 'contenttitle').text)
            title = pub.split('标       题：')[1].split('发文字号：')[0].strip()
            pub_source = pub.split('发文机关：')[1].split('成文日期：')[0].strip()
            pub_hao = pub.split('发文字号：')[1].split('发布日期：')[0].strip()
            pub_time = pub.split('发布日期：')[1].split('主  题  词：')[0].strip()
            return title, content, pub_source, pub_hao, pub_time

        def parse_article_layout(bro):
            content_ = bro.find_element(By.XPATH, '/html/body/div[7]/div')
            content = str(content_.get_attribute('innerHTML'))
            title = content_.find_element(By.TAG_NAME, 'h1').text
            info = str(content_.find_element(By.TAG_NAME, 'p').text)
            pub_time = info.split('日期：')[1].split('来源：')[0].lstrip()
            pub_source = info.split('来源：')[1].strip()
            return title, content, pub_source, '', pub_time

        num = 0
        start_time = time.time()
        bro = new_browser()
        url = 'http://gzw.gansu.gov.cn/gzw/c115553/xxgk_list.shtml'
        # The shared 'links_*' parser also extracts the publish time, which the
        # last fallback here used to leave unset (NameError or a stale value
        # carried over from the previous document).
        parsers = (parse_detail_layout, parse_article_layout,
                   lambda b: parse_links_layout(b, 'box_tab'))
        try:
            hrefs = []
            try:
                open_listing(bro, url)
                while True:
                    hrefs.extend(visible_hrefs(bro))
                    try:
                        bro.find_element(By.CLASS_NAME, 'nextpage').click()
                    except Exception:
                        # No usable "next page" control: the listing is exhausted.
                        break
            except Exception as e:
                # Keep whatever links were gathered before the failure.
                log.error(f'gan_su3: failed while paging through {url}: {e}')
            num = crawl_details(bro, hrefs, parsers)
        finally:
            bro.quit()
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    gan_su1()
    gan_su2()
    gan_su3()


# 宁夏
def ning_xia():
    """Crawl the Ningxia SASAC document listing and pass every new document to ``save_data``.

    Visits the three listing pages of the ``gzwwj`` column, skips detail URLs
    already present in ``db_storage``, parses each remaining detail page and
    forwards the extracted record.  A failure is logged and only the affected
    listing page or document is skipped.  The number of forwarded documents
    and the elapsed time are printed at the end.
    """
    # Substrings that mark a link inside the article body as an attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    for page in range(0, 3):
        if page == 0:
            url = 'http://gzw.nx.gov.cn/zcfg/zcwj/gzwwj/index.html'
        else:
            url = f'http://gzw.nx.gov.cn/zcfg/zcwj/gzwwj/index_{page}.html'
        try:
            res = requests.get(url=url, headers=headers, verify=False)
            try:
                res.encoding = res.apparent_encoding
                soup = BeautifulSoup(res.text, 'html.parser')
            finally:
                res.close()
            li_list = soup.find('div', attrs={'class': 'stdnewslist'}).find_all('li')
        except Exception as e:
            log.error(f'ning_xia: failed to load listing page {url}: {e}')
            continue
        for li in li_list:
            link = li.find('a')
            if link is None:
                # List entry without a link: nothing to fetch.
                continue
            # Each entry is handled on its own so a malformed one no longer
            # discards the remaining entries of the page.
            try:
                title = link.get('title').replace('</p>', '').replace('<p>', '')
                # Detail links are relative to the directory of the listing page.
                href = url.split('index')[0] + link.get('href').replace('./', '')
                if db_storage.find_one({'网址': href}):
                    continue
                href_res = requests.get(url=href, headers=headers, verify=False)
                try:
                    href_res.encoding = href_res.apparent_encoding
                    soup_ = BeautifulSoup(href_res.text, 'html.parser')
                finally:
                    href_res.close()
                # The metadata table is read as text (spaces removed) and cut
                # apart at its labels.
                pub_result = soup_.find('table', attrs={'class': 'gk-xl-table'}).text.replace(' ', '')
                pub_time = pub_result.split('生成日期')[1].split('发文字号')[0].strip() + ' 00:00:00'
                pub_hao = pub_result.split('发文字号')[1].split('公开形式')[0].strip()
                pub_source = pub_result.split('所属机构')[1].split('有效性')[0].strip()
                content = str(soup_.find('div', attrs={'class': 'content'}).find('div', attrs={'class': 'TRS_UEDITOR'}))
                fu_jian_href_list = [
                    attachment for attachment in re.findall('href="(.*?)"', content)
                    if any(ext in attachment for ext in attachment_exts)
                ]
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '宁夏回族自治区国资委',
                }
                print(title)
                save_data(result_dict)
                num += 1
            except Exception as e:
                log.error(f'ning_xia: failed to process entry {link.get("href")} on {url}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 陕西
def shanxi():
    """Crawl the Shaanxi SASAC channel 100127 and pass every new document to ``save_data``.

    Loads the single listing page, skips detail URLs already present in
    ``db_storage``, parses each remaining detail page and forwards the
    extracted record (no attachments are collected for this site).  A failure
    is logged and only the affected document is skipped.  The number of
    forwarded documents and the elapsed time are printed at the end.
    """
    num = 0
    start_time = time.time()
    url = 'https://sxgz.shaanxi.gov.cn/newstyle/pub_newschannel.asp?chid=100127'
    li_list = []
    try:
        res = requests.get(url=url, headers=headers)
        try:
            res.encoding = res.apparent_encoding
            soup = BeautifulSoup(res.text, 'html.parser')
        finally:
            # Release the connection even when decoding or parsing fails.
            res.close()
        li_list = soup.find(class_='scroll_cont').find_all('li')
    except Exception as e:
        log.error(f'shanxi: failed to load listing page {url}: {e}')
    for li in li_list:
        link = li.find('a')
        if link is None or not link.get('href'):
            # List entry without a usable link: nothing to fetch.
            continue
        href = link['href']
        if 'http' not in str(href):
            href = 'https://sxgz.shaanxi.gov.cn/' + href
        # Each document is handled on its own so one broken page does not
        # discard the remaining entries.
        try:
            if db_storage.find_one({'网址': href}):
                continue
            res_href = requests.get(url=href, headers=headers)
            try:
                res_href.encoding = res_href.apparent_encoding
                i_soup = BeautifulSoup(res_href.text, 'html.parser')
            finally:
                res_href.close()
            title = i_soup.find(class_='m-gk-title').text
            span_list = i_soup.find(class_='ftitle').find_all('span')
            # The first span is used as the source and the third as the publish
            # time; both are taken as the raw markup between the span tags.
            pub_source = str(span_list[0]).split('<span>')[1].split('</span>')[0]
            pub_time = str(span_list[2]).split('<span>')[1].split('</span>')[0]
            # The article body is the first <div> inside 'scroll_cont'.
            content = str(i_soup.find(class_='scroll_cont').find_all('div')[0])
            result_dict = {
                '标题': title,
                '来源': pub_source,
                '号': '',
                '内容': content,
                '附件网址': [],
                '发布时间': pub_time,
                '网址': href,
                '归属': '陕西省国资委',
            }
            print(title)
            save_data(result_dict)
            num += 1
        except Exception as e:
            log.error(f'shanxi: failed to process {href}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 西藏
def xi_zang():
    """Crawl two Lhasa (Tibet) SASAC listings and pass every new document to ``save_data``.

    Loads the ``zccfg`` and ``wjzl`` listing pages, skips detail URLs already
    present in ``db_storage``, parses each remaining detail page and forwards
    the extracted record.  A failure is logged and only the affected listing
    page or document is skipped.  The number of forwarded documents and the
    elapsed time are printed at the end.
    """
    host = 'http://gzw.lasa.gov.cn'
    # Substrings that mark a link inside the article body as an attachment.
    attachment_exts = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                       '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    num = 0
    start_time = time.time()
    url_list = ['http://gzw.lasa.gov.cn/gzw/zccfg/common_list.shtml',
                'http://gzw.lasa.gov.cn/gzw/wjzl/common_list.shtml', ]
    for url in url_list:
        try:
            res = requests.get(url=url, headers=headers)
            res.encoding = res.apparent_encoding
            soup = BeautifulSoup(res.text, 'html.parser')
            li_list = soup.find('ul', class_='list').find_all('li')
        except Exception as e:
            log.error(f'xi_zang: failed to load listing page {url}: {e}')
            continue
        for li in li_list:
            link = li.find('a')
            if link is None or not link.get('href'):
                # List entry without a usable link: nothing to fetch.
                continue
            href = link['href']
            title = link.text
            if 'http' not in str(href):
                href = host + href
            # Each document is handled on its own so one broken page does not
            # discard the remaining entries of the listing.
            try:
                if db_storage.find_one({'网址': href}):
                    continue
                res_href = requests.get(url=href, headers=headers)
                res_href.encoding = res_href.apparent_encoding
                i_soup = BeautifulSoup(res_href.text, 'html.parser')
                div_list = i_soup.find(class_='inform').find_all('div')
                # The first div is used as the publish time and the second as
                # the source, with their labels stripped.
                pub_time = str(div_list[0]).split('<div>')[1].split('</div>')[0].replace('发布时间：', '')
                pub_source = str(div_list[1]).split('<div>')[1].split('</div>')[0].replace('来源：', '')
                content = str(i_soup.find(id='NewsContent'))
                fu_jian_href_list = []
                for attachment in re.findall('href="(.*?)"', content):
                    if not any(ext in attachment for ext in attachment_exts):
                        continue
                    # Undo the '&amp;' escaping of the raw markup.
                    attachment = attachment.replace('amp;', '')
                    # Relative links belong to this site (they were previously
                    # prefixed with the Jiangxi host by mistake); absolute links
                    # are kept unchanged, like the detail links above.
                    if 'http' not in attachment:
                        attachment = host + attachment
                    fu_jian_href_list.append(attachment)
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': '',
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '西藏拉萨市国资委',
                }
                print(title)
                save_data(result_dict)
                num += 1
            except Exception as e:
                log.error(f'xi_zang: failed to process {href}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 青海
def qing_hai():
    """Crawl policy documents of the Qinghai SASAC site (gxgz.qinghai.gov.cn).

    Two passes are run, each printing its own "saved count / elapsed time"
    summary line:

    * ``qing_hai1`` reads the single listing page ``lmid=604`` (no pagination).
    * ``qing_hai2`` walks every page of the listings ``lmid=627/559/64/65/558``.

    Every listing row whose URL is not already stored in ``db_storage`` is
    turned into a ``result_dict`` and handed to ``save_data``. A failure on
    one row or one listing page is logged and skipped so it cannot abort the
    remaining rows or pages.
    """
    host = 'http://gxgz.qinghai.gov.cn'
    # Substrings that mark an href as a downloadable attachment.
    attachment_marks = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                        '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')

    def fetch_soup(url):
        """Download ``url`` and return it parsed; the response is always closed."""
        res = requests.get(url=url, headers=headers)
        try:
            res.encoding = res.apparent_encoding
            return BeautifulSoup(res.text, 'html.parser')
        finally:
            res.close()

    def collect_attachments(content, href):
        """Build attachment URLs from the hrefs found in the detail HTML."""
        base = href.replace('.shtml', '').replace('content_', '')
        found = []
        for link in re.findall('href="(.*?)"', content):
            if not any(mark in link for mark in attachment_marks):
                continue
            pieces = link.split('/files')
            # Links without a '/files' segment cannot be rebuilt; skip them.
            if len(pieces) < 2:
                continue
            found.append(base + '/files' + pieces[1])
        return found

    def parse_detail_div(detail):
        """Extract (title, content, source) from the div-based detail layout."""
        title = str(detail.find('div', attrs={'class': 'con-title'}).text).strip()
        content = str(detail.find('div', attrs={'class': 'nxgz-detail-con'}))
        return title, content, ''

    def parse_detail_table(detail):
        """Extract (title, content, source) from the table-based detail layout."""
        title_cells = detail.findAll('td', attrs={'class': 'zdgk_xlbt'})
        # Prefer the second title cell when present, otherwise the first one.
        title_cell = title_cells[1] if len(title_cells) > 1 else title_cells[0]
        title = str(title_cell.text).replace(u'\xa0', u' ').replace(' ', '')
        content = str(detail.find('td', attrs={'class': 'yhhei15'}))
        try:
            source_cell = detail.find('td', attrs={'class': 'heizi12'})
            pub_source = str(source_cell.text).split()[0].replace('来源：', '')
        except (AttributeError, IndexError):
            # Source cell missing or empty: fall back to an empty source.
            pub_source = ''
        return title, content, pub_source

    def crawl_rows(list_page, parse_detail):
        """Save every new row of one listing page; return how many were saved."""
        saved = 0
        for tr in list_page.findAll('tr', attrs={'class': 'lxzd'}):
            href = None
            try:
                href = host + tr.find('a').get('href')
                if db_storage.find_one({'网址': href}):
                    continue
                cells = tr.findAll('td')
                pub_hao = cells[2].text
                pub_time = cells[-2].text
                if '.html' in href:
                    title, content, pub_source = parse_detail(fetch_soup(href))
                    fu_jian_href_list = collect_attachments(content, href)
                else:
                    # Non-HTML targets are not fetched; only the listing
                    # metadata is stored.
                    title = str(tr.find('a').text).replace(u'\xa0', u' ').replace(' ', '')
                    pub_source = ''
                    content = ''
                    fu_jian_href_list = []
                result_dict = {
                    '标题': title,
                    '来源': pub_source,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '青海省国资委',
                }
                print(title)
                save_data(result_dict)
                saved += 1
            except Exception as e:
                log.error(f'qing_hai: failed to process row {href}: {e}')
        return saved

    def qing_hai1():
        num = 0
        start_time = time.time()
        url_mode = 'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=604'
        try:
            num = crawl_rows(fetch_soup(url_mode), parse_detail_div)
        except Exception as e:
            log.error(f'qing_hai: failed to load listing {url_mode}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    def qing_hai2():
        num = 0
        start_time = time.time()
        urls = [
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=627',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=559',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=64',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=65',
            'http://gxgz.qinghai.gov.cn/xxgk/List_wj.aspx?lmid=558',
        ]
        for url_mode in urls:
            try:
                first_page = fetch_soup(url_mode)
                pager_text = str(first_page.find('div', attrs={'class': 'pages-pages'}).text)
                # The page count is the number between '共' and '页' in the
                # first whitespace-separated token of the pager text.
                page_numbers = int(pager_text.split()[0].split('共')[-1].split('页')[0])
            except Exception as e:
                log.error(f'qing_hai: failed to read page count of {url_mode}: {e}')
                continue
            for page_number in range(1, page_numbers + 1):
                url = url_mode + f'&pages={page_number}'
                try:
                    num += crawl_rows(fetch_soup(url), parse_detail_table)
                except Exception as e:
                    log.error(f'qing_hai: failed to load listing {url}: {e}')
        end_time = time.time()
        print(f'共抓取{num}条数据,共耗时{end_time - start_time}')

    qing_hai1()
    qing_hai2()


# 河北
def he_bei():
    """Crawl normative documents of the Hebei SASAC from its JSON feed.

    The feed at ``/Json/GFXWJ51.json`` is read as a list of records. Every
    record whose detail URL is not yet in ``db_storage`` is converted to a
    ``result_dict`` and passed to ``save_data``. A record that cannot be
    processed is logged and skipped; the remaining records are still handled.
    Prints the number of saved records and the elapsed time at the end.
    """
    num = 0
    start_time = time.time()
    url = 'http://hbsa.hebei.gov.cn/Json/GFXWJ51.json'
    try:
        # headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` (the query string), not the headers.
        res = requests.get(url, headers=headers)
        infos = res.json()
    except Exception as e:
        log.error(f'he_bei: failed to load feed {url}: {e}')
        infos = []
    for info in infos:
        href = None
        try:
            title = info['title']
            content = info['content']
            doc_id = info['id']
            href = 'http://hbsa.hebei.gov.cn/xxgk/GFXWJ?id=' + str(doc_id)
            if db_storage.find_one({'网址': href}):
                continue
            # 'updated' is divided by 1000 to obtain a second-level timestamp.
            seconds = round(info['updated'] / 1000)
            pub_time = time.strftime("%Y-%m-%d", time.localtime(seconds))
            soup = BeautifulSoup(content, 'html.parser')
            # The document number is the first <span> that contains
            # '冀国资', '号' and both '〔' '〕' brackets and is at most 25
            # characters long; it stays empty when no span qualifies.
            pub_hao = ''
            for span in soup.find_all('span'):
                text = span.text
                if all(mark in text for mark in ('冀国资', '号', '〔', '〕')) and len(text) <= 25:
                    pub_hao = text
                    break
            fu_jian_list = [fu_jian['filepath'] for fu_jian in (info.get('attachmentList') or [])]
            result_dict = {
                '标题': title,
                '来源': '',
                '号': pub_hao,
                '内容': content,
                '附件网址': fu_jian_list,
                '发布时间': pub_time,
                '网址': href,
                '归属': '河北省国资委',
            }
            print(title)
            save_data(result_dict)
            num += 1
        except Exception as e:
            log.error(f'he_bei: failed to process record {href}: {e}')
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


# 湖北
def hu_bei():
    """Crawl normative documents of the Hubei SASAC with headless Chrome.

    The listing page is opened in Selenium, the link of every ``<li>`` under
    ``#ulList`` is collected, and each link not yet stored in ``db_storage``
    is opened and converted to a ``result_dict`` for ``save_data``. A
    document that fails to parse is logged and skipped. The browser session
    is always shut down, even when the listing page itself fails (that error
    still propagates to the caller). Prints the number of saved records and
    the elapsed time at the end.
    """
    num = 0
    start_time = time.time()
    url = 'http://gzw.hubei.gov.cn/zfxxgk/zc/gfxwj/'
    # Substrings that mark an href as a downloadable attachment.
    attachment_marks = ('.doc', '.pdf', '.xls', '.zip', '.rar', '.ppt',
                        '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
    chrome_driver = r'D:\spider\85\chromedriver.exe'
    path = Service(chrome_driver)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.binary_location = r'D:\spider\85\Google\Chrome\Application\chrome.exe'
    # 'options' is the supported keyword; 'chrome_options' is deprecated.
    driver = webdriver.Chrome(service=path, options=chrome_options)
    try:
        driver.get(url)
        # Give the listing time to render before reading it.
        time.sleep(2)
        ul = driver.find_element(By.ID, 'ulList')
        # Collect all links first: navigating away would invalidate the
        # list elements.
        hrefs = [
            li.find_element(By.TAG_NAME, 'a').get_attribute('href')
            for li in ul.find_elements(By.TAG_NAME, 'li')
        ]
        for href in hrefs:
            is_href = db_storage.find_one({'网址': href})
            if is_href:
                continue
            try:
                driver.get(href)
                article = driver.find_element(By.CLASS_NAME, 'article')
                title = article.find_element(By.TAG_NAME, 'h2').text.strip()
                # The metadata table is read as plain text and cut apart at
                # its field labels.
                table = article.find_element(By.CLASS_NAME, 'table-bordered').text
                pub_come = table.split('发布机构')[1].split('发文日期')[0].strip()
                pub_time = table.split('发文日期')[1].split('文    号')[0].replace('\n', '').replace('\r', '').strip()
                pub_hao = table.split('文    号')[1].split('有 效 性')[0].strip()
                content = str(article.find_element(By.CLASS_NAME, 'article-box').get_attribute('outerHTML'))
                fu_jian_href_list = [
                    link for link in re.findall('href="(.*?)"', content)
                    if any(mark in link for mark in attachment_marks)
                ]
                result_dict = {
                    '标题': title,
                    '来源': pub_come,
                    '号': pub_hao,
                    '内容': content,
                    '附件网址': fu_jian_href_list,
                    '发布时间': pub_time,
                    '网址': href,
                    '归属': '湖北省国资委',
                }
                print(title)
                save_data(result_dict)
                num += 1
            except Exception as e:
                log.error(f'hu_bei: failed to process {href}: {e}')
    finally:
        # quit() ends the whole session and the chromedriver process;
        # close() would only close the current window.
        driver.quit()
    end_time = time.time()
    print(f'共抓取{num}条数据,共耗时{end_time - start_time}')


if __name__ == '__main__':
    # Script entry point: each call below runs one crawler. Only bei_jing()
    # is currently enabled; the other calls are commented out and can be
    # re-enabled by uncommenting the corresponding line.
    # NOTE(review): most of these functions are defined outside this part of
    # the file — confirm they still exist before re-enabling them.
    # get_content1()
    # get_content2()
    # get_content3()
    bei_jing()
    # nei_meng_gu()
    # ji_lin()
    # shang_hai()
    # zhe_jiang()
    # fu_jian()
    # shan_dong()
    # guang_dong()
    # hai_nan()
    # si_chuan()
    # guang_xi()
    # gui_zhou()
    # yun_nan()
    # chong_qing()
    # tian_jin()
    # xin_jiang()
    # shan_xi()
    # liao_ning()
    # hei_long_jiang()
    # jiang_su()
    # an_hui()
    # jiang_xi()
    # he_nan()
    # hu_nan()
    # gan_su()
    # ning_xia()
    # xi_zang()
    # shanxi()
    # qing_hai()
    # he_bei()
    # hu_bei()
