import os
import random
import requests
from bs4 import BeautifulSoup
import time
from retry import retry
from urllib.parse import urljoin

import BaseCore

# Shared project helper instance; this module uses its getLogger(), get_proxy(),
# r (set operations), sendkafka() and deliteATT() members.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

from reits import Policy
# Project-level Policy helper; beijing() calls requestPost() and attuributefile() on it.
policy = Policy()


# Topic name passed to baseCore.sendkafka().
topic = 'policy'
# Source site name; also the suffix of the 'REITs::' set key used to de-duplicate URLs.
webname = '北京市人民政府'

class Policy1():
    """HTTP fetch helpers and BeautifulSoup clean-up utilities for the crawler."""

    # Browser-like User-Agent sent by the page-fetching helpers.
    DEFAULT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    # Seconds passed as ``timeout`` to every request so that a stalled
    # connection raises instead of blocking the crawl forever.
    REQUEST_TIMEOUT = 60

    def _fetch_soup(self, url, proxies=None):
        """GET *url* and return the body parsed with ``html.parser``.

        :param url: page address to download.
        :param proxies: proxies mapping handed to ``requests.get`` (``None`` = direct).
        :raises requests.exceptions.HTTPError: if the status code is not 200.
        """
        req = requests.get(url, headers=dict(self.DEFAULT_HEADERS), proxies=proxies,
                           timeout=self.REQUEST_TIMEOUT)
        try:
            if req.status_code != 200:
                # An explicit exception (instead of a bare ``raise`` with no active
                # exception) keeps the retry decorator working and tells the reader
                # what actually went wrong.
                raise requests.exceptions.HTTPError(
                    f'unexpected HTTP status {req.status_code} for {url}')
            # The raw bytes are handed to BeautifulSoup, which detects the charset
            # itself, so the response's text encoding does not need to be set.
            return BeautifulSoup(req.content, 'html.parser')
        finally:
            # Release the connection on the error path as well.
            req.close()

    @retry(tries=3, delay=10)
    def getrequest_soup(self, url):
        """Fetch *url* through a proxy from ``baseCore.get_proxy()`` and return the soup.

        Retried up to 3 times, 10 seconds apart, on any exception.
        """
        ip = baseCore.get_proxy()
        return self._fetch_soup(url, proxies=ip)

    @retry(tries=3, delay=10)
    def getrequest_soup_(self, url):
        """Fetch *url* without a proxy and return the soup.

        Retried up to 3 times, 10 seconds apart, on any exception.
        """
        return self._fetch_soup(url)

    def getrequest_json(self, headers, url):
        """GET *url* through a proxy and return the decoded JSON body."""
        ip = baseCore.get_proxy()
        req = requests.get(headers=headers, url=url, proxies=ip,
                           timeout=self.REQUEST_TIMEOUT)
        try:
            return req.json()
        finally:
            req.close()

    def requestPost(self, headers, url, payload):
        """POST *payload* as form data (no proxy) and return the decoded JSON body."""
        req = requests.post(headers=headers, url=url, data=payload,
                            timeout=self.REQUEST_TIMEOUT)
        try:
            return req.json()
        finally:
            req.close()

    def requestPost_html(self, headers, url, payload):
        """POST *payload* as form data through a proxy and return the parsed HTML."""
        ip = baseCore.get_proxy()
        req = requests.post(headers=headers, url=url, data=payload, proxies=ip,
                            timeout=self.REQUEST_TIMEOUT)
        try:
            return BeautifulSoup(req.content, 'html.parser')
        finally:
            req.close()

    def deletep(self, soup, i, tag, attribute_to_delete, value_to_delete):
        """Decompose the first *i* ``tag`` elements whose attribute equals the given value."""
        matches = soup.find_all(tag, {attribute_to_delete: value_to_delete})
        for match in matches[:i]:
            match.decompose()

    def deletespan(self, td):
        """Remove every <span> element under *td*."""
        for span in td.find_all('span'):
            span.extract()

    def deletetag(self, td, tag):
        """Remove every *tag* element under *td*."""
        for tag_ in td.find_all(tag):
            tag_.extract()

    def deletetext(self, soup, tag, text):
        """Remove those of the first 10 *tag* elements whose text contains *text*."""
        for tag_ in soup.find_all(tag)[:10]:
            if text in tag_.text:
                tag_.extract()

    def deletek(self, soup):
        """Remove blank elements (e.g. ``<p></p>``) that hold no img/video/br.

        An element is a candidate when its text is empty and it is not itself an
        img/video/br, or when its text is exactly one space. A candidate is kept
        if any of its descendants is an img, video or br.
        """
        keep = ("img", "video", "br")

        def is_blank(tag):
            text = tag.get_text()
            return (len(text) == 0 and tag.name not in keep) or text == ' '

        for node in soup.find_all(is_blank):
            if any(child.name in keep for child in node.descendants):
                continue
            node.decompose()

    def paserUrl(self, html, listurl):
        """Rewrite relative ``href``/``src`` values of <a> and <img> against *listurl*.

        :param html: a BeautifulSoup object, or an HTML string that is parsed first.
        :param listurl: base URL used to resolve relative addresses.
        :return: the (possibly newly parsed) soup with absolute addresses.
        """
        if isinstance(html, str):
            html = BeautifulSoup(html, 'html.parser')

        for link in html.find_all(['a', 'img']):
            # Only one attribute is rewritten per element; ``href`` takes priority.
            if 'href' in link.attrs:
                link['href'] = urljoin(listurl, link['href'])
            elif 'src' in link.attrs:
                link['src'] = urljoin(listurl, link['src'])
        return html


def getFjContent(url):
    """Download *url* and return the raw response body as bytes.

    Sleeps 5 seconds after the download before returning.

    :param url: address of the file to download.
    :return: the response body (``bytes``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    }
    # A timeout keeps a stalled download from blocking the crawl indefinitely.
    req = requests.get(url, headers=headers, timeout=60)
    try:
        # Only the raw bytes are returned, so no charset detection is needed;
        # running it over a large binary payload is wasted work.
        content = req.content
    finally:
        req.close()
    time.sleep(5)
    return content


def _beijing_fetch_article(policy1, news_url):
    """Return the parsed article page, falling back to a direct request.

    The proxied fetch is tried first; if it fails, the same URL is requested
    again without a proxy. A failure of the fallback propagates to the caller.
    """
    try:
        return policy1.getrequest_soup(news_url)
    except Exception:
        return policy1.getrequest_soup_(news_url)


def _beijing_strip_scripts_and_styles(container):
    """Remove every <script> and <style> element from *container* in place."""
    for tag_name in ('script', 'style'):
        for node in container.find_all(tag_name):
            node.decompose()


# Beijing Municipal People's Government: https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs
def beijing():
    """Crawl the 'REITs' search results of the Beijing government policy tab.

    For every search hit whose URL is not yet in the 'REITs::<webname>' set,
    the article page is downloaded, its main text extracted, attachments (for
    policy documents) uploaded through ``policy.attuributefile``, and the
    resulting record sent with ``baseCore.sendkafka``. Hits whose label is
    neither '政策解读' nor '政策文件', or whose body is empty, are skipped.
    """
    policy1 = Policy1()
    url = 'https://www.beijing.gov.cn/so/ss/query/s'
    payload = {
        'siteCode': '1100000088',
        'tab': 'zcfg',
        'qt': 'REITs',
        'sort': 'relevance',
        'keyPlace': '0',
        'locationCode': '110000000000',
        'page': '1',
        'pageSize': '20',
        'ie': '89b5e964-dc3c-4a5b-8d80-f6c408769b4a'
    }
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '148',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'Path=/; Path=/; __jsluid_s=91bdb0d83098fd2e8a8455a9085a22e2; JSESSIONID=M2FmNDczYzYtMmNkYS00N2I0LThhNDgtYWJiMTdhOTIyZDI4; _va_ref=%5B%22%22%2C%22%22%2C1699515166%2C%22https%3A%2F%2Fdocs.qq.com%2F%22%5D; _va_ses=*; JSESSIONID=CD61DA650DB33324962A3BF2527672D0; arialoadData=false; _va_id=c7a63e4b2199befd.1699358536.2.1699515273.1699515166.; CPS_SESSION=2FEFDC54444B24762D057AD6BDE3C7BF',
        'Host': 'www.beijing.gov.cn',
        'Origin': 'https://www.beijing.gov.cn',
        'Referer': 'https://www.beijing.gov.cn/so/s?siteCode=1100000088&tab=zcfg&qt=REITs',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"'
    }
    result = policy.requestPost(headers, url, payload)
    total = result['totalHits']
    page_size = result['currentHits']
    if not page_size:
        # An empty first page would otherwise cause a division by zero below.
        log.info(f'无检索结果--{webname}')
        return
    # Ceiling division: no extra (empty) page when total is an exact multiple.
    max_page = (total + page_size - 1) // page_size
    # Running counter handed to policy.attuributefile(); advanced once per processed item.
    num = 1
    for page in range(max_page):
        if page == 0:
            # The first page was already fetched to learn the totals.
            data = result
        else:
            data = policy.requestPost(headers, url, {**payload, 'page': page + 1})
        for info_ in data['resultDocs']:
            id_list = []

            info = info_['data']
            origin = info['siteLabel']['value'].strip()
            title = info['titleO'].strip()
            titleLabel = info['titleLabel']['value'].strip()
            publishDate = info['docDate'].strip()
            newsUrl = info['url'].strip()
            # De-duplicate by URL.
            if baseCore.r.sismember('REITs::' + webname, newsUrl):
                continue
            summary = BeautifulSoup(info['summary'].strip(), 'lxml').text.strip()
            writtenDate = None
            pub_hao = ''
            organ = ''
            content = ''
            contentWithTag_str = ''
            topicClassification = ''

            if titleLabel in ('政策解读', '政策文件'):
                newssoup = _beijing_fetch_article(policy1, newsUrl)
                contentWithTag = newssoup.find('div', id='mainText')
                if contentWithTag is None:
                    # Page layout without a main text container: nothing to collect.
                    log.error(f'未找到正文--{title}--{newsUrl}')
                    continue
                _beijing_strip_scripts_and_styles(contentWithTag)
                content = contentWithTag.text.strip()
                contentWithTag_str = str(contentWithTag)

                if titleLabel == '政策解读':
                    # The issuing organ follows the '来源：' marker in the byline.
                    other_message = newssoup.find('div', class_='othermessage')
                    source_p = other_message.find('p', class_='fl') if other_message is not None else None
                    if source_p is not None and '来源：' in source_p.text:
                        organ = source_p.text.split('来源：')[1].strip()
                else:
                    doc_info = newssoup.find('ol', class_='doc-info')
                    if doc_info is not None:
                        for li in doc_info.find_all('li'):
                            if '成文日期' in li.text:
                                date_span = li.find('span')
                                if date_span is not None:
                                    writtenDate = date_span.text.strip()
                    for row in info['formatRows']:
                        for col in row['col']:
                            name = col['text']
                            if name == '相关附件':
                                tag_str = ''
                                # Attachment anchors on the page, matched by position
                                # with the entries of the search result.
                                attachment_list = newssoup.find('ul', class_='fujian')
                                attachment_links = (attachment_list.find_all('a')
                                                    if attachment_list is not None else [])
                                for i, (file_href, file_name) in enumerate(col['value'].items()):
                                    category = os.path.splitext(file_href)[1]
                                    if category not in file_name:
                                        file_name = file_name + category

                                    # Upload the attachment (to OBS, per the original comment).
                                    att_id, full_path = policy.attuributefile(file_name, file_href, num, publishDate)
                                    if att_id:
                                        id_list.append(att_id)
                                        if i < len(attachment_links):
                                            link = attachment_links[i]
                                            link['href'] = full_path
                                            tag_str += str(link) + '<br>'
                                contentWithTag_str += tag_str
                            elif '号' in name:
                                pub_hao = col['value'].strip()
                            elif '发文机构' in name:
                                organ = col['value'][0].strip()
                            elif '主题分类' in name:
                                topicClassification = col['value'][0].strip()
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            if content == '':
                continue
            dic_info = {
                'attachmentIds': id_list,
                'author': '',
                'content': content,
                'contentWithTag': contentWithTag_str,
                'deleteFlag': 0,
                'checkStatus': 1,
                'id': '',
                'title': title,
                'publishDate': publishDate,
                'origin': origin,
                'sourceAddress': newsUrl,
                'writtenDate': writtenDate,
                'organ': organ,
                'topicClassification': topicClassification,
                'issuedNumber': pub_hao,
                'summary': summary,
                'createDate': time_now,
                'sid': '1729041207245328385'
            }
            try:
                baseCore.sendkafka(dic_info, topic)
                baseCore.r.sadd('REITs::' + webname, newsUrl)
                log.info(f'采集成功--{title}--{newsUrl}')
            except Exception as e:
                # Report the failure instead of swallowing it, then roll back the
                # attachments uploaded for this item.
                log.error(f'采集失败--{title}--{newsUrl}--{e}')
                for att_id in id_list:
                    baseCore.deliteATT(att_id)
            # Random pause between items.
            time.sleep(random.randint(10, 20))
            num += 1

# if __name__ == '__main__':
#     beijing()
#     baseCore.close()
