import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project-local helper object. In this file it supplies the request headers
# (`headers`), the de-duplication collection (`db_storage`), HTML
# normalisation (`paserUrl`) and the Kafka / persistence calls
# (`sendKafka`, `save_data`).
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local core utilities. In this file it supplies the logger and the
# attachment calls (`uptoOBS`, `tableUpdate`).
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Heilongjiang
_HLJ_SITE = 'http://gzw.hlj.gov.cn'
_HLJ_SEARCH_URL = (
    _HLJ_SITE + '/common/search/a4e4f3e94596456db749bfb0f7937cc7'
    '?_isAgg=true&_isJson=true&_pageSize=10&_template=index'
    '&_rangeTimeGte=&_channelName=&page={page}'
)
# Lower-case substrings that mark a link as a downloadable attachment.
# '.doc' also covers '.docx' and '.xls' also covers '.xlsx'.
_HLJ_ATTACHMENT_MARKERS = ('.pdf', '.doc', '.xls', '.zip', '.rar', '.ppt')
# NOTE(review): the relation name below reads "Jiangsu SASAC" although this
# crawler targets the Heilongjiang site. It looks like a copy-paste leftover,
# but the correct id/name pair is not visible here, so the original values
# are kept unchanged -- confirm against the label table.
_HLJ_RELATION_ID = '1687'
_HLJ_RELATION_NAME = '江苏省国资委'
_HLJ_SID = '1697458829758697473'
# Prefix put in front of the stored attachment path when links are rewritten.
_HLJ_FILE_HOST = 'http://zzsn.luyuen.com/'
# Seconds to wait for the remote server before giving up on a request.
_HLJ_TIMEOUT = 30


def _hlj_is_attachment(file_href):
    """Return True when *file_href* contains a known attachment extension."""
    lowered = file_href.lower()
    return any(marker in lowered for marker in _HLJ_ATTACHMENT_MARKERS)


def _hlj_issued_number(result):
    """Return the document number ('文号') of a search result, or ''."""
    try:
        meta = result['domainMetaList'][1]['resultList'][0]
        return meta['value'] if meta['name'] == '文号' else ''
    except (KeyError, IndexError, TypeError):
        # The metadata entry is optional; a missing one must not drop the row.
        return ''


def _hlj_fetch_origin(href):
    """Fetch the detail page and return the text of the <b> inside class 'ly'."""
    response = requests.get(url=href, headers=baseTool.headers, verify=False,
                            timeout=_HLJ_TIMEOUT)
    response.encoding = response.apparent_encoding
    page = BeautifulSoup(response.text, 'html.parser')
    holder = page.find(class_='ly')
    bold = holder.find('b') if holder is not None else None
    return bold.text if bold is not None else ''


def _hlj_upload_attachments(soup, order, publishDate):
    """Upload every attachment linked in *soup* and rewrite its href in place.

    Returns the list of attachment ids returned by ``baseCore.tableUpdate``.
    Links whose upload reports a falsy ``state`` are left untouched.
    """
    id_list = []
    for link in soup.find_all('a'):
        file_href = link.get('href')
        if not file_href or not _hlj_is_attachment(file_href):
            continue
        file_name = link.text.strip()
        category = os.path.splitext(file_href)[1]
        # Make sure the stored name carries the file extension.
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, _HLJ_RELATION_ID, file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(
            retData, _HLJ_RELATION_NAME, file_name, order, publishDate)
        id_list.append(att_id)
        # Point the link in the stored HTML at the uploaded copy.
        link['href'] = _HLJ_FILE_HOST + str(full_path)
    return id_list


def _hlj_build_news(result, title, href, order):
    """Build the Kafka payload for one search result, or None if it has no text."""
    publishDate = result['publishedTimeStr']
    pub_hao = _hlj_issued_number(result)
    raw_html = result['contentHtml']
    origin = _hlj_fetch_origin(href)
    soup = baseTool.paserUrl(str(raw_html), href)
    id_list = _hlj_upload_attachments(soup, order, publishDate)
    contentWithTag = str(soup.prettify())
    content = soup.text
    if not content:
        return None
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': str(content),
        'contentWithTag': str(contentWithTag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': _HLJ_RELATION_ID,
                    'relationName': _HLJ_RELATION_NAME,
                    'labelMark': "policy"}],
        'origin': origin,
        'organ': '',
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'publishDate': publishDate,
        'writtenDate': None,
        'sid': _HLJ_SID,
        'sourceAddress': href,
        'summary': '',
        'title': title
    }


def hei_long_jiang():
    """Crawl the first two result pages of the Heilongjiang SASAC search API.

    For every result whose URL is not already in ``baseTool.db_storage`` the
    detail page is fetched, attachments are uploaded, and the assembled
    record is sent to Kafka and saved. The crawl is best-effort: a failure
    on one page or one row is logged and the crawl moves on. Returns None;
    a summary line is logged at the end.
    """
    # `num` counts duplicates plus stored records and is also passed to
    # tableUpdate as the attachment order; `count` counts stored records only.
    num = 0
    count = 0
    start_time = time.time()
    for page in range(1, 3):
        url = _HLJ_SEARCH_URL.format(page=page)
        try:
            web = requests.get(url=url, headers=baseTool.headers, verify=False,
                               timeout=_HLJ_TIMEOUT)
            data = web.json()['data']
            # Iterate the rows actually returned (capped by the declared
            # count) so a short page cannot index past the end of the list.
            results = data['results'][:int(data['rows'])]
        except (requests.RequestException, ValueError, KeyError, TypeError) as e:
            log.error(f'failed to load listing page {url}: {e!r}')
            continue
        for result in results:
            href = ''
            try:
                title = result['title']
                href = _HLJ_SITE + result['url']
                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue
                dic_news = _hlj_build_news(result, title, href, num)
                if dic_news is None:
                    log.info(f'-----{href}----{title}----内容为空-----')
                    continue
                if baseTool.sendKafka(dic_news):
                    baseTool.save_data(dic_news)
                    num += 1
                    count += 1
            except Exception as e:
                # Row-level boundary: keep crawling, but leave a trace
                # instead of silently discarding the error.
                log.error(f'failed to process row {href!r} on page {page}: {e!r}')
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Run the Heilongjiang crawl only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    hei_long_jiang()