import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project helper instantiated once at import time. This file uses its
# `headers`, `db_storage`, `paserUrl`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project helper instantiated once at import time. This file uses it to obtain
# the logger (`getLogger`) and to handle attachments (`uptoOBS`, `tableUpdate`).
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Guangxi
# Listing sections of the Guangxi SASAC site as (base URL, number of list
# pages recorded for that section).
_GX_SECTIONS = (
    ('http://gzw.gxzf.gov.cn/wjzx/2023nwj/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/2022nwj/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/2021nwj/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/2020nwj/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/2019nwj/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/2018nwj/', 2),
    ('http://gzw.gxzf.gov.cn/wjzx/2017nwj/', 2),
    ('http://gzw.gxzf.gov.cn/wjzx/2016nwj/', 2),
    ('http://gzw.gxzf.gov.cn/wjzx/2015nwj/', 3),
    ('http://gzw.gxzf.gov.cn/wjzx/2014nwj/', 2),
    ('http://gzw.gxzf.gov.cn/wjzx/2013nwj/', 2),
    ('http://gzw.gxzf.gov.cn/wjzx/2012nwj/', 2),
    ('http://gzw.gxzf.gov.cn/wjzx/2011nwj/', 5),
    ('http://gzw.gxzf.gov.cn/wjzx/wjhbdej2008n2010n/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/wjhbdyj2004n2007n/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2013n/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/jshgfxwj2004n2015n/', 1),
    ('http://gzw.gxzf.gov.cn/wjzx/gfxwjhb2004n2015n/', 1),
)

# A link inside the article body is treated as an attachment when its href
# contains one of these substrings (case-sensitive, exactly as listed).
# NOTE(review): lowercase '.doc' and '.xls' are not matched, only '.docx' /
# '.xlsx' and the upper-case forms. This looks like an oversight; confirm
# before widening the list.
_GX_ATTACHMENT_MARKERS = (
    '.docx', '.pdf', '.xlsx', '.zip', '.rar', '.ppt',
    '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR',
)

# Seconds to wait for the site before giving up on a single HTTP request.
_GX_REQUEST_TIMEOUT = 60


def _gx_issued_number(heading):
    """Return the sub-heading as the document number if it is bracketed, else ''."""
    if '﹝' in heading and '﹞' in heading:
        # Normalise the small-form brackets to the standard tortoise-shell ones.
        return heading.replace('﹝', '〔').replace('﹞', '〕')
    if '〔' in heading and '〕' in heading:
        return heading
    return ''


def _gx_split_pub_info(pub_info):
    """Split the article info line into ``(publish_time, source)``.

    Returns ``None`` when the '来源：' marker is missing, in which case the
    line cannot be parsed.
    """
    parts = pub_info.split('来源：')
    if len(parts) < 2:
        return None
    # The page shows the time without seconds, so ':00' is appended.
    pub_time = parts[0].strip() + ':00'
    # Anything from '作者：' onwards is the author, not the source.
    pub_source = parts[1].split('作者：')[0].strip()
    return pub_time, pub_source


def _gx_upload_attachments(content_soup, pub_time, order_no):
    """Upload attachment links found in the article body and rewrite their hrefs.

    Returns the list of attachment ids produced by ``baseCore.tableUpdate``.
    """
    attachment_ids = []
    for link in content_soup.find_all('a'):
        link_href = link.get('href')
        if link_href is None:
            continue
        if not any(marker in link_href for marker in _GX_ATTACHMENT_MARKERS):
            continue
        file_name = link.text.strip()
        suffix = os.path.splitext(link_href)[1]
        if suffix not in file_name:
            file_name = file_name + suffix
        # Upload the attachment to the file server.
        ret_data = baseCore.uptoOBS(link_href, '1692', file_name)
        if not ret_data['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(ret_data, '广西壮族自治区国资委', file_name, order_no,
                                                 pub_time)
        attachment_ids.append(att_id)
        # Point the link at the uploaded copy.
        link['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
    return attachment_ids


def _gx_process_article(title, href, order_no):
    """Fetch one article, send it to Kafka and store it; return True when stored."""
    detail_html = requests.get(url=href, headers=baseTool.headers, verify=False,
                               timeout=_GX_REQUEST_TIMEOUT).content
    detail_doc = pq(detail_html)
    pub_info = _gx_split_pub_info(detail_doc('.article-inf-left').text())
    if pub_info is None:
        log.info(f'-----{href}----{title}----publish info not found, skipped-----')
        return False
    pub_time, pub_source = pub_info
    pub_hao = _gx_issued_number(detail_doc('.article-h2').text())

    # Convert relative paths in the article body to absolute ones.
    content_soup = BeautifulSoup(str(detail_doc('.article-con div:first-child')), 'html.parser')
    content_soup = baseTool.paserUrl(content_soup, href)
    content = content_soup.text.strip()
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return False

    # Must run before contentWithTag is serialised so the rewritten attachment
    # links end up in the stored HTML.
    attachment_ids = _gx_upload_attachments(content_soup, pub_time, order_no)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields of the record sent to Kafka.
    dic_news = {
        'attachmentIds': attachment_ids,
        'author': '',
        'content': content,
        'contentWithTag': str(content_soup),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1692", 'relationName': "广西壮族自治区国资委", 'labelMark': "policy"}],
        'origin': '',
        'organ': pub_source,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'publishDate': pub_time,
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }
    if not baseTool.sendKafka(dic_news):
        return False
    baseTool.save_data(dic_news)
    log.info(title)
    return True


def guang_xi(max_pages=1):
    """Crawl policy documents from the Guangxi SASAC site.

    For every listing section, fetch its index page(s), skip articles whose URL
    is already present in ``baseTool.db_storage``, and for each new article
    fetch the detail page, upload its attachments, send the record to Kafka
    and save it.

    Args:
        max_pages: Upper bound on the list pages fetched per section. The
            default ``1`` fetches only ``index.shtml`` of each section. Pass
            ``None`` to fetch every page recorded for the section.

    Failures on a single list page or a single article are logged and skipped;
    they do not abort the crawl.
    """
    from urllib.parse import urljoin

    # Running ordinal handed to baseCore.tableUpdate; it advances for both
    # already-stored and newly-saved articles.
    num = 0
    # Number of articles actually saved in this run (reported at the end).
    count = 0
    start_time = time.time()

    for section_url, listed_pages in _GX_SECTIONS:
        page_total = listed_pages if max_pages is None else min(listed_pages, max_pages)
        for page in range(page_total):
            # The first list page is index.shtml, later ones are index_<n>.shtml.
            if page == 0:
                list_url = f'{section_url}index.shtml'
            else:
                list_url = f'{section_url}index_{page}.shtml'
            try:
                list_html = requests.get(url=list_url, headers=baseTool.headers, verify=False,
                                         timeout=_GX_REQUEST_TIMEOUT).content
                entries = list(pq(list_html)('#morelist li').items())
            except Exception as exc:
                log.error(f'-----{list_url}----list page failed: {exc!r}-----')
                continue

            for entry in entries:
                anchor = entry('a')
                raw_title = anchor.attr('title')
                raw_href = anchor.attr('href')
                if raw_title is None or raw_href is None:
                    # An entry without a usable link must not abort the whole page.
                    log.info(f'-----{list_url}----list entry without title/href, skipped-----')
                    continue
                title = raw_title.strip()
                href = urljoin(list_url, raw_href)
                try:
                    if baseTool.db_storage.find_one({'网址': href}):
                        num += 1
                        continue
                    saved = _gx_process_article(title, href, num)
                except Exception as exc:
                    log.error(f'-----{href}----{title}----article failed: {exc!r}-----')
                    continue
                if saved:
                    num += 1
                    count += 1

    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Run the Guangxi crawl when this file is executed as a script.
if __name__ == "__main__":
    guang_xi()