import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper; this file uses its `headers`, `db_storage`,
# `paserUrl`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
# NOTE: instantiated at import time, so importing this module runs the
# ClassTool constructor.
baseTool = ClassTool()

# Project-local helper; this file uses its `getLogger`, `uptoOBS` and
# `tableUpdate` methods. Also instantiated at import time.
from BaseCore import BaseCore
baseCore = BaseCore()
# Shared logger used for progress and diagnostic messages in this file.
log = baseCore.getLogger()

# Hunan Province SASAC policy and regulation crawler.

# Seconds to wait on any single HTTP request before giving up, so that one
# stalled connection cannot hang the whole crawl.
_HUNAN_REQUEST_TIMEOUT = 30

# Lower-cased substrings that mark a link as a downloadable attachment.
# '.doc' also covers '.docx', '.xls' covers '.xlsx' and '.ppt' covers '.pptx'.
_HUNAN_ATTACHMENT_MARKERS = ('.pdf', '.doc', '.xls', '.zip', '.rar', '.ppt')

# CSS selector for the items of the metadata list on a detail page.
_HUNAN_META_LIST = 'div[class="information-zt-list fn-clear"]>ul>li'


def _hunan_list_url(page):
    """Return the URL of list page *page* (1-based) of the policy index."""
    base = 'http://gzw.hunan.gov.cn/gzw/xxgk_71571/zcfg/'
    if page == 1:
        return base + 'index.html'
    return f'{base}index_{page}.html'


def _hunan_is_attachment(file_href):
    """Return True if *file_href* looks like a link to a downloadable file.

    The check is a case-insensitive substring match against
    ``_HUNAN_ATTACHMENT_MARKERS``, so e.g. ``.PDF``, ``.Pdf`` and ``.xls``
    are all recognised.
    """
    lowered = file_href.lower()
    return any(marker in lowered for marker in _HUNAN_ATTACHMENT_MARKERS)


def _hunan_meta_field(doc, index, label):
    """Return the text of the *index*-th metadata item with *label* removed."""
    return doc(f'{_HUNAN_META_LIST}:nth-child({index})').text().replace(label, '')


def _hunan_upload_attachments(soup, num, publishDate):
    """Upload the attachments linked from *soup* and rewrite their hrefs.

    Every ``<a>`` whose href looks like an attachment is passed to
    ``baseCore.uptoOBS``; when the returned ``state`` is truthy the upload is
    recorded with ``baseCore.tableUpdate`` and the anchor's ``href`` in *soup*
    is replaced in place with the stored location.

    Args:
        soup: BeautifulSoup tree of the article body; modified in place.
        num: running row counter, forwarded unchanged as the fourth argument
            of ``baseCore.tableUpdate`` (presumably an ordering index --
            confirm against BaseCore).
        publishDate: publish date text taken from the list page.

    Returns:
        List of attachment ids returned by ``baseCore.tableUpdate``.
    """
    id_list = []
    for link in soup.find_all('a'):
        file_href = link.get('href')
        # Anchors without an href, or pointing at ordinary pages, are ignored.
        if not file_href or not _hunan_is_attachment(file_href):
            continue
        file_name = link.text.strip()
        category = os.path.splitext(file_href)[1]
        # Make sure the stored file name carries the extension.
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1691', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '湖南省国资委', file_name, num, publishDate)
        id_list.append(att_id)
        # Point the link at the uploaded copy. The original prefix was
        # 'http:obs.ciglobal.cn/' (missing '//'), which is not a valid
        # absolute URL.
        link['href'] = 'http://obs.ciglobal.cn/' + str(full_path)
    return id_list


def _hunan_build_news(href, publishDate, num):
    """Fetch one article page and build the record to publish.

    Args:
        href: absolute URL of the article.
        publishDate: publish date text taken from the list page.
        num: running row counter, forwarded to the attachment upload.

    Returns:
        The dict to hand to ``baseTool.sendKafka``, or ``None`` when the
        article body is empty (which is logged).
    """
    # verify=False is kept from the original code; certificate checks are
    # skipped for this site.
    res = requests.get(url=href, headers=baseTool.headers, verify=False,
                       timeout=_HUNAN_REQUEST_TIMEOUT)
    res.encoding = res.apparent_encoding
    soup = baseTool.paserUrl(res.text, href)
    doc = pq(str(soup))

    organ = _hunan_meta_field(doc, 3, '发布机构：')
    # Some pages fill this field with inline JavaScript; treat it as missing.
    if 'document.write' in organ:
        organ = ''
    writtenDate = _hunan_meta_field(doc, 4, '发文日期：')
    title = _hunan_meta_field(doc, 5, '名称：')
    topicClassification = _hunan_meta_field(doc, 6, '主题分类：')

    content_soup = BeautifulSoup(str(doc('div[class="information-zt-show"]')), 'html.parser')
    # Attachments are uploaded before the emptiness check, as in the
    # original flow; this also rewrites the links inside content_soup.
    id_list = _hunan_upload_attachments(content_soup, num, publishDate)

    contentWithTag = content_soup.prettify()
    content = content_soup.text
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return None

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields of the record sent to Kafka.
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': str(content),
        'contentWithTag': str(contentWithTag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1691", 'relationName': "湖南省国资委", 'labelMark': "policy"}],
        'origin': '',
        'organ': organ,
        'topicClassification': topicClassification,
        'issuedNumber': '',
        'publishDate': publishDate,
        'writtenDate': writtenDate,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title
    }


def hu_nan():
    """Crawl list pages 1-6 of the Hunan SASAC policy index.

    For every table row whose article URL is not already present in
    ``baseTool.db_storage``, the article page is fetched, its metadata and
    attachments are extracted, and the record is sent with
    ``baseTool.sendKafka``; when that returns a truthy value the record is
    also stored with ``baseTool.save_data``.

    The crawl is best-effort: a failure on a list page or on a single article
    is logged and skipped instead of aborting the run. Returns ``None``; the
    number of newly sent articles and the elapsed time are logged at the end.
    """
    # num counts every row handled so far (already stored + newly sent);
    # count counts only newly sent articles.
    num = 0
    count = 0
    start_time = time.time()
    for page in range(1, 7):
        url = _hunan_list_url(page)
        try:
            resp_text = requests.get(url=url, headers=baseTool.headers, verify=False,
                                     timeout=_HUNAN_REQUEST_TIMEOUT).content
            rows = list(pq(resp_text)('.table tbody tr').items())
        except Exception as e:
            log.error(f'Failed to load list page {url}: {e!r}')
            continue

        for row in rows:
            href = None
            try:
                link = row('a').attr('href')
                # Rows without a link (e.g. header or filler rows) carry no
                # article; skip them instead of failing the whole page.
                if not link:
                    continue
                href = 'http://gzw.hunan.gov.cn' + link
                publishDate = row('td:nth-child(3)').text()
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                dic_news = _hunan_build_news(href, publishDate, num)
                if dic_news is None:
                    continue
                if baseTool.sendKafka(dic_news):
                    baseTool.save_data(dic_news)
                    num += 1
                    count += 1
            except Exception as e:
                # Per-article boundary: log and move on to the next row.
                log.error(f'Failed to process article {href} on {url}: {e!r}')
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Run the Hunan crawl only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    hu_nan()