import requests
import json
import sys

import redis
import requests
from bs4 import BeautifulSoup
from kafka import KafkaProducer
# Extend the import path before importing BaseCore (presumably the directory
# that holds BaseCore.py on the author's Windows machine -- confirm when
# running elsewhere). The order of these two statements matters.
sys.path.append('D:\\kkwork\\zzsn_spider\\base')
import BaseCore
# Project helper object; in the visible code it is only used to obtain the
# logger shared by the crawler below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
# Request headers sent with every HTTP request in this file: a desktop Chrome
# User-Agent string.
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
    }

# Source identifier used by the crawler; the Redis set that remembers already
# collected article URLs is named after it.
_INFO_CODE = 'IN-20240129-0019'
_SEEN_SET = _INFO_CODE + '-test'
_SITE_ROOT = 'http://www.sasac.gov.cn/'
_KAFKA_SERVERS = ['114.115.159.144:9092']
_KAFKA_TOPIC = 'crawlerInfo'
# Seconds to wait for an HTTP response; without a timeout a stalled
# connection would block the crawl forever.
_REQUEST_TIMEOUT = 30
# Seconds to wait for the broker to acknowledge a sent record.
_KAFKA_ACK_TIMEOUT = 10


def _fetch_soup(url, verify=True):
    """Download *url* and return it parsed with BeautifulSoup.

    Returns None (after logging) when the request itself fails, so callers
    can skip the page instead of aborting the crawl.
    """
    try:
        res = requests.get(url=url, headers=headers, verify=verify,
                           timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        log.info(f'request failed, skipping {url}: {e}')
        return None
    res.encoding = res.apparent_encoding
    return BeautifulSoup(res.text, 'html.parser')


def _read_pagination(soup):
    """Return ``(pages_tag, max_page)`` read from the pager cell, or None.

    ``pages_tag`` is the suffix of the pager's ``pag_<tag>`` id and
    ``max_page`` the integer that follows ``maxPageNum<tag>=`` in its markup.
    """
    pager = soup.find('td', class_='pages')
    if pager is None or not pager.get('id'):
        return None
    try:
        pages_tag = pager['id'].split('pag_')[1]
        max_page = int(
            str(pager).split(f'maxPageNum{pages_tag}=')[1].split('";')[0])
    except (IndexError, ValueError):
        return None
    return pages_tag, max_page


def _listing_items(soup, pages_tag):
    """Return the <li> entries of a listing page.

    Prefers the ``comp_<tag>`` container; falls back to every <li> in the
    document when that container is absent.
    """
    container = soup.find('span', id=f'comp_{pages_tag}')
    if container is not None:
        return container.find_all('li')
    return soup.find_all('li')


def _parse_article(article_soup):
    """Extract ``(pub_source, pub_time, content, content_with_tag)``.

    Two page layouts are handled. Raises AttributeError or IndexError when
    the page matches neither layout.
    """
    header = article_soup.find(class_='zsy_cotitle')
    if header is not None:
        meta = header.find('p').text
        parts = meta.split('发布时间：')
        pub_source = parts[0].replace('文章来源：', '').strip()
        # Not stripped, unlike the fallback layout below; kept as-is so the
        # emitted payload does not change.
        pub_time = parts[1]
        # Remove each widget independently: a missing first widget must not
        # leave the second one inside the stored content.
        for div_id in ('div_div', 'qr_container'):
            widget = article_soup.find('div', id=div_id)
            if widget is not None:
                widget.decompose()
        body = article_soup.find(class_='zsy_comain')
        content_with_tag = str(body)
        content = str(body.text).replace('扫一扫在手机打开当前页', '')
        return pub_source, pub_time, content, content_with_tag

    spans = article_soup.find(class_='lyshijian').find_all('span')
    try:
        pub_source = str(spans[0]).split('文章来源：')[1].split('</span>')[0].strip()
        pub_time = str(spans[1]).split('发布时间：')[1].split('</span>')[0].strip()
    except IndexError:
        # No source span: the first span carries the publish time instead.
        pub_time = str(spans[0]).split('发布时间：')[1].split('</span>')[0].strip()
        pub_source = ''
    # NOTE(review): markup and text come from differently named containers;
    # if 'pages_content' is absent the literal string 'None' is stored --
    # confirm this is intended.
    content_with_tag = str(article_soup.find(class_='pages_content'))
    content = str(article_soup.find(class_='articlecontent').text)
    return pub_source, pub_time, content, content_with_tag


def _collect_article(li, page):
    """Turn one listing <li> into the record to publish, or None to skip it.

    Skips entries without a link, entries already present in the Redis
    "seen" set, entries without a title, and articles that cannot be
    downloaded or parsed. Reads the module-level Redis client ``r``.
    """
    if not len(li):
        return None
    a = li.find('a')
    # Entries without an anchor or href cannot be followed; skip them
    # instead of letting a TypeError/KeyError abort the whole crawl.
    if a is None or not a.get('href'):
        return None
    href = a['href']
    if 'http' not in href:
        href = _SITE_ROOT + str(href).replace('../../', '')

    try:
        if r.sismember(_SEEN_SET, href):
            log.info('信息已采集入库过')
            return None
    except Exception as e:
        # Without the dedupe answer the item is skipped (as before), but the
        # reason is now visible in the log.
        log.info(f'redis lookup failed, skipping {href}: {e}')
        return None

    title = a.get('title', '')
    if title == '':
        # Checked before downloading: an untitled entry is never published.
        log.info(f'title为空----{page}--{title}--{href}')
        return None

    article_soup = _fetch_soup(href, verify=False)
    if article_soup is None:
        return None
    try:
        pub_source, pub_time, content, content_with_tag = _parse_article(article_soup)
    except (AttributeError, IndexError) as e:
        log.info(f'unrecognised article layout, skipping {href}: {e}')
        return None

    log.info(f'{page}--{title}--{href}')
    return {
        'id': '',
        'sid': '1751849444877144065',
        'title': title,
        'organ': pub_source,
        'origin': '国务院国有资产监督管理委员会',
        'source': 16,
        'content': content,
        'contentWithTag': content_with_tag,
        'publishDate': pub_time,
        'sourceAddress': href,
    }


def _open_producer():
    """Return a KafkaProducer, or None (after logging) if it cannot connect."""
    try:
        return KafkaProducer(bootstrap_servers=_KAFKA_SERVERS)
    except Exception as e:
        log.info(f'kafka producer unavailable: {e}')
        return None


def _publish(producer, record):
    """Send *record* to Kafka and mark its URL as collected once acknowledged."""
    try:
        future = producer.send(
            _KAFKA_TOPIC, json.dumps(record, ensure_ascii=False).encode('utf8'))
        # send() is asynchronous; wait for the acknowledgement so the URL is
        # only remembered in Redis when the record was really delivered.
        future.get(timeout=_KAFKA_ACK_TIMEOUT)
        r.sadd(_SEEN_SET, record['sourceAddress'])
        log.info('发送kafka成功！')
    except Exception as e:
        log.info(e)


def two_dfsm_mtgc(start_page=1, end_page=378):
    """
    Local scan: crawl the SASAC listing pages and publish each new article.

    For every listing URL the pager is read, the listing pages
    ``start_page .. end_page - 1`` are walked, and every article that is not
    yet in the Redis "seen" set is downloaded, parsed and sent to the Kafka
    topic ``crawlerInfo``. Failures on a single page or article are logged
    and skipped.

    Relies on the module-level names ``r`` (Redis client), ``headers`` and
    ``log``.

    :param start_page: first listing page to crawl (1 is the index page).
    :param end_page: listing page to stop before (exclusive).
    """
    url_list = ['http://www.sasac.gov.cn/n2588025/n2588129/index.html',
                # 'http://www.sasac.gov.cn/n2588025/n2588139/index.html'
                ]
    # One producer is shared by the whole run; it is created on first use so
    # an unreachable broker does not stop the crawl before it starts.
    producer = None
    try:
        for list_url in url_list:
            index_soup = _fetch_soup(list_url)
            if index_soup is None:
                continue
            pagination = _read_pagination(index_soup)
            if pagination is None:
                log.info(f'no pagination info found, skipping {list_url}')
                continue
            pages_tag, max_page = pagination
            # Page URLs are derived from the listing URL itself so that each
            # entry of url_list crawls its own column.
            base_url = list_url.rsplit('/', 1)[0]
            for page in range(start_page, end_page):
                log.info(f'==============开始采集第{page}页===============')
                if page == 1:
                    page_url = list_url
                else:
                    # Older pages carry a descending index in their file name.
                    page_url = f'{base_url}/index_{pages_tag}_{max_page + 1 - page}.html'
                page_soup = _fetch_soup(page_url)
                if page_soup is None:
                    continue
                for li in _listing_items(page_soup, pages_tag):
                    record = _collect_article(li, page)
                    if record is None:
                        continue
                    if producer is None:
                        producer = _open_producer()
                        if producer is None:
                            continue
                    _publish(producer, record)
    finally:
        if producer is not None:
            producer.close()
if __name__ == "__main__":
    # The crawler looks up the module-level name `r` for URL de-duplication,
    # so the Redis client has to be bound to that name before it is started.
    redis_settings = {
        'host': '114.115.236.206',
        'port': 6379,
        'password': 'clbzzsn',
        'db': 5,
    }
    r = redis.Redis(**redis_settings)
    two_dfsm_mtgc()