import datetime
import json
import time

import redis
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from kafka import KafkaProducer

from base.BaseCore import BaseCore
# Project helper object; in this file it is only used to obtain the logger.
baseCore = BaseCore()

# Logger shared by sendKafka and the crawl loop under __main__.
log = baseCore.getLogger()
# Redis connection used by the crawl loop to record which article URLs were already sent.
# NOTE(review): host and password are hard-coded in source — consider loading them from configuration.
r = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn', db=0)
def sendKafka(dic_news):
    """Publish one article record to the ``crawlerInfo`` Kafka topic.

    Args:
        dic_news: JSON-serialisable dict describing the article.

    Returns:
        True when the send was acknowledged within 10 seconds, False when
        connecting, serialising or sending failed (the failure is logged,
        not raised).
    """
    producer = None
    try:
        producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'],max_request_size=1024*1024*20)
        kafka_result = producer.send("crawlerInfo",
                                     json.dumps(dic_news, ensure_ascii=False).encode('utf8'))

        # Wait for the acknowledgement so a failed send is reported here
        # instead of being lost.
        print(kafka_result.get(timeout=10))

        dic_result = {
            'success': 'true',
            'message': '操作成功',
            'code': '200',
        }
        log.info(dic_result)
        return True
    except Exception as e:
        dic_result = {
            'success': 'false',
            'message': '操作失败',
            'code': '204',
            'e': e
        }
        log.info(dic_result)
        return False
    finally:
        # A fresh producer is created on every call, so it has to be closed
        # here; otherwise one producer is leaked per article sent.
        if producer is not None:
            producer.close()

def getRequest(url,headers):
    """Fetch *url* with a 30 second timeout and return the parsed HTML.

    Args:
        url: Address to request.
        headers: Dict of HTTP headers sent with the request.

    Returns:
        A BeautifulSoup tree built from the response body. The body is parsed
        whatever the HTTP status is; a non-200 status is only logged.

    Raises:
        Whatever ``requests.get`` raises (timeouts, connection errors, ...).
    """
    req = requests.get(url=url, headers=headers, timeout=30)
    if req.status_code != 200:
        # The original status check did nothing. Logging keeps the return
        # value unchanged for callers while making error pages traceable.
        log.info(f'unexpected HTTP status {req.status_code} for {url}')
    return BeautifulSoup(req.content, 'html.parser')

def deletep(soup,attribute_to_delete,value_to_delete):
    """Remove, in place, every <p> tag whose given attribute has the given value.

    Args:
        soup: Parsed document or tag to search below.
        attribute_to_delete: Name of the attribute to match on.
        value_to_delete: Attribute value a <p> must carry to be removed.
    """
    attr_filter = {attribute_to_delete: value_to_delete}
    for paragraph in soup.find_all('p', attr_filter):
        paragraph.decompose()

def deletek(soup):
    """Remove blank tags (for example <p></p>) from *soup* in place.

    A tag is a candidate when its text is a single space, or when its text is
    empty and the tag itself is not an img, video or br. A candidate is kept
    when any of its descendants is an img, video or br; otherwise it is removed.
    """
    keep_names = ("img", "video", "br")

    def is_blank(tag):
        text = tag.get_text()
        if text == ' ':
            return True
        return len(text) == 0 and tag.name not in keep_names

    for candidate in soup.find_all(is_blank):
        # A wrapper that only holds media or a line break is not really empty.
        if any(child.name in keep_names for child in candidate.descendants):
            continue
        candidate.decompose()

# Convert the relative addresses inside an HTML document to absolute ones.
def paserUrl(html, listurl):
    """Resolve ``href``/``src`` of all <a> and <img> tags against *listurl*.

    Args:
        html: Either an HTML string (parsed here with ``html.parser``) or an
            already parsed tree, which is modified in place.
        listurl: Base URL the relative addresses are resolved against.

    Returns:
        The parsed tree with rewritten addresses; this is the same object as
        *html* when a parsed tree was passed in.
    """
    if isinstance(html, str):
        html = BeautifulSoup(html, 'html.parser')

    for link in html.find_all(['a', 'img']):
        # Handle each attribute independently: the previous if/elif left
        # ``src`` relative on any tag that also carried an ``href``.
        for attr in ('href', 'src'):
            if attr in link.attrs:
                link[attr] = urljoin(listurl, link[attr])
    return html

if __name__=='__main__':
    # Crawl the journal index page, follow each linked page down to the
    # article listings, and send every article whose author line matches the
    # whitelist below to Kafka. URLs that were sent are stored in a Redis set
    # (one set per listing title) so that a re-run does not send them again.
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Cookie':'UM_distinctid=18b5f64f72a580-0d0997e58eee04-26031e51-e1000-18b5f64f72bab5; wdcid=23a1d057521777ff; wdses=22f0d407e263a31e; CNZZDATA30019853=cnzz_eid%3D744929620-1698112534-%26ntime%3D1698112562; wdlast=1698112562',
        'Host':'www.qstheory.cn',
        'Proxy-Connection':'keep-alive',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
    }
    url = 'http://www.qstheory.cn/qs/mulu.htm'
    # Only entries whose author line contains this string are collected.
    target_author = '国资委党委'
    # Prefix of the Redis sets that record already-sent article URLs.
    redis_prefix = 'qiushileaderspeech_two::'

    soup_report = getRequest(url,headers)
    for book in soup_report.find_all('div', class_='col-sm-3'):
        booktitle = book.find('div', class_='booktitle')
        book_link = booktitle.find('a') if booktitle is not None else None
        if book_link is None or not book_link.get('href'):
            # A cell without a link used to abort the whole run.
            continue
        href = book_link['href']
        soup_href = getRequest(href,headers)
        period = soup_href.find('div', class_='highlight')
        if period is None:
            log.info(f'no "highlight" block found on {href}, skipped')
            continue
        deletep(period,'align','center')
        deletek(period)
        for p in period.find_all('p'):
            period_link = p.find('a')
            if period_link is None or not period_link.get('href'):
                continue
            period_href = period_link['href']
            period_title = period_link.text
            redis_key = redis_prefix + period_title
            soup_news = getRequest(period_href,headers)
            deletep(soup_news, 'align', 'center')
            deletek(soup_news)
            # The first <p> of the listing is not an article entry and is skipped.
            title_list = soup_news.select('div[class="highlight"]>p')[1:]
            for new in title_list:
                try:
                    deletek(new)
                    author_tag = new.find('font', face='楷体')
                    new_link = new.find('a')
                    if author_tag is None or new_link is None:
                        continue
                    author = author_tag.text.replace('/', '').replace('\u3000', ' ').replace('\xa0', '')
                    # The previous version also skipped every author longer than
                    # 4 characters before this check. The whitelisted name is 5
                    # characters long, so no entry could pass both tests and
                    # nothing was ever collected. The length test was dropped and
                    # the whitelist is now the only filter.
                    if target_author not in author:
                        continue
                    new_href = new_link['href']
                    if r.sismember(redis_key, new_href):
                        continue
                    new_title = new_link.text.replace('\u3000',' ').lstrip(' ').replace('——', '').replace('\xa0', '')
                except Exception as e:
                    # Skip the entry as before, but no longer swallow
                    # KeyboardInterrupt/SystemExit and leave a trace of the cause.
                    log.info(f'skipped an entry of {period_href}: {e!r}')
                    continue
                soup_new = getRequest(new_href,headers)
                deletek(soup_new)
                deletep(soup_new, 'style', 'TEXT-ALIGN: center')
                result = soup_new.find('div', class_='inner')
                body = soup_new.find('div', class_='highlight')
                if result is None or body is None:
                    continue
                span_list = result.find_all('span')
                if len(span_list) < 3:
                    # Source is read from the first span and the publish time
                    # from the third one; without them the record is incomplete.
                    log.info(f'unexpected page layout on {new_href}, skipped')
                    continue
                source = span_list[0].text.replace('来源：', '').strip('\r\n')
                pub_time = span_list[2].text.strip('\r\n')
                content = body.text
                # Make links and image sources absolute before the markup is stored.
                paserUrl(soup_new, new_href)
                contentWithTag = soup_new.find('div', class_='highlight')
                nowDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                dic_news = {
                    'sid': '1716996740019585025',
                    'title': new_title,
                    'source': "16",
                    'origin': source,
                    'author': author,
                    'publishDate': pub_time,
                    'content': content,
                    'contentWithTag': str(contentWithTag),
                    'sourceAddress': new_href,
                    "createDate": nowDate
                }
                log.info(dic_news)
                # Record the URL only after a successful send so that failed
                # articles are retried on the next run.
                if sendKafka(dic_news):
                    r.sadd(redis_key, new_href)
                    log.info(f'采集成功----{dic_news["sourceAddress"]}')


