import datetime
import os
import re
import time

import requests
from lxml import etree
from pyquery import PyQuery as pq

# Shared project helper. This module uses its `headers`, `paserUrl`,
# `db_storage`, `sendKafka` and `save_data` members.
from ClassTool import ClassTool
baseTool = ClassTool()

# Shared project core. This module uses it for attachment upload
# (`uptoOBS`), attachment bookkeeping (`tableUpdate`) and the logger.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Shanxi
_SHANXI_BASE_URL = 'http://gzw.shanxi.gov.cn/zxhrdgz/fzyd/zywj/'
_SHANXI_LIST_ROW_XPATH = (
    '/html/body/table[3]/tbody/tr/td[2]/table/tbody/tr[3]/td/table[2]/tbody/tr[3]/td/form/table/tbody/tr')
# Seconds to wait for the server before giving up on a request.
_SHANXI_REQUEST_TIMEOUT = 30
# Substrings that mark a link as a downloadable attachment.  Kept exactly as
# the original check had them (including 'xlsx' without a leading dot).
_SHANXI_ATTACHMENT_MARKERS = ('.pdf', '.docx', '.doc', 'xlsx', '.zip', '.rar', '.ppt',
                              '.PDF', '.DOC', '.XLS', '.ZIP', '.RAR')
_SHANXI_ISSUED_NUMBER_RE = re.compile(r'(晋国资.{1,}?号)|(国资.{1,}?号)')


def _shanxi_fetch_html(url):
    """Download *url* and return the response body decoded as UTF-8."""
    # `headers` must be passed by keyword: the second positional argument of
    # requests.get() is `params`, which would put the headers in the query string.
    res = requests.get(url, headers=baseTool.headers, timeout=_SHANXI_REQUEST_TIMEOUT)
    # Decode the raw bytes directly instead of round-tripping `res.text`
    # through ISO-8859-1, which fails whenever requests did not guess Latin-1.
    return res.content.decode('utf-8')


def _shanxi_issued_number(content):
    """Return the first document number found in *content*, or '' if none fits."""
    match = _SHANXI_ISSUED_NUMBER_RE.search(content)
    if match is None:
        return ''
    # group(0) is the text of whichever alternative matched; indexing only the
    # first capture group would yield '' for the second alternative.
    issued_number = match.group(0)
    # Over-long matches are treated as false positives.
    return issued_number if len(issued_number) <= 20 else ''


def _shanxi_upload_attachments(soup, num, publish_date):
    """Upload attachment links found in *soup* and return their attachment ids.

    Each successfully uploaded link has its ``href`` rewritten in place so the
    serialized content points at the stored copy.
    """
    id_list = []
    for link in soup.find_all('a'):
        file_href = link.get('href')
        if not file_href:
            continue
        if not any(marker in file_href for marker in _SHANXI_ATTACHMENT_MARKERS):
            continue
        file_name = link.text.strip()
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1684', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '山西省国资委', file_name, num, publish_date)
        id_list.append(att_id)
        # NOTE(review): this prefix has no '//' after the scheme; left unchanged
        # because downstream consumers may depend on it -- confirm and fix there too.
        link['href'] = 'http:zzsn.luyuen.com/' + str(full_path)
    return id_list


def shan_xi():
    """Crawl the Shanxi SASAC policy list pages and forward new documents.

    Walks the six list pages, skips URLs already present in
    ``baseTool.db_storage``, downloads each remaining detail page, uploads its
    attachments, and sends the assembled record through
    ``baseTool.sendKafka``; records are saved with ``baseTool.save_data`` only
    when the send reports success.  Failures are logged and the crawl moves on
    to the next row or page.  Returns ``None``.
    """
    num = 0
    count = 0
    start_time = time.time()
    for page_no in range(1, 7):
        if page_no == 1:
            list_url = _SHANXI_BASE_URL
        else:
            list_url = f'{_SHANXI_BASE_URL}index_{page_no - 1}.shtml'
        try:
            tree = etree.HTML(_shanxi_fetch_html(list_url))
            tr_list = tree.xpath(_SHANXI_LIST_ROW_XPATH)
        except Exception as e:
            log.error(f'-----{list_url}----列表页抓取失败----{e}-----')
            continue
        for tr in tr_list:
            href = tr.xpath('./td[1]/a/@href')
            if not href:
                continue
            # This string-replace normalisation is kept as-is on purpose: the
            # resulting URL is the dedup key looked up in db_storage.
            href = href[0].replace('../../', _SHANXI_BASE_URL).replace('./', _SHANXI_BASE_URL)
            # One bad row must not abort the rest of the page.
            try:
                title = tr.xpath('./td[1]/a/span//text()')[0]
                publishDate_ = str(tr.xpath('./td[2]/span/text()')[0]).strip()
                # Normalise the list page's YYYY/MM/DD date to YYYY-MM-DD.
                publishDate = datetime.datetime.strptime(publishDate_, "%Y/%m/%d").strftime("%Y-%m-%d")

                if baseTool.db_storage.find_one({'网址': href}):
                    num += 1
                    continue

                if ".pdf" in href:
                    # NOTE(review): direct PDF links carry no HTML body; the
                    # original flow also ended up rejecting them as empty
                    # content.  Ingesting them as attachments needs a decision
                    # on file naming and publish date.
                    log.info(f'-----{href}----{title}----内容为空-----')
                    continue

                detail_page = baseTool.paserUrl(_shanxi_fetch_html(href), href)
                doc = pq(str(detail_page))
                title = doc('title').text()
                origin = ''
                soup = baseTool.paserUrl(str(doc('div[id="vsb_content"]')), href)

                # Attachments are always discovered from the current article
                # body (no state carried over from a previous row).
                id_list = _shanxi_upload_attachments(soup, num, publishDate)

                contentWithTag = str(soup.prettify())
                content = soup.text
                if not content:
                    log.info(f'-----{href}----{title}----内容为空-----')
                    continue

                issuedNumber = _shanxi_issued_number(content)
                time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                # Fields sent to Kafka.
                dic_news = {
                    'attachmentIds': id_list,
                    'author': '',
                    'content': str(content),
                    'contentWithTag': str(contentWithTag),
                    'createDate': time_now,
                    'deleteFlag': 0,
                    'id': '',
                    'labels': [{'relationId': "1684", 'relationName': "山西省国资委", 'labelMark': "policy"}],
                    'origin': origin,
                    'organ': "",
                    'topicClassification': "",
                    'issuedNumber': issuedNumber,
                    'publishDate': publishDate,
                    'writtenDate': None,
                    'sid': '1697458829758697473',
                    'sourceAddress': href,
                    'summary': '',
                    'title': title
                }
                flag = baseTool.sendKafka(dic_news)
                if flag:
                    baseTool.save_data(dic_news)
                    num += 1
                    count += 1
            except Exception as e:
                log.error(f'-----{href}----处理失败----{e}-----')
    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Run the Shanxi crawl when this file is executed as a script.
if __name__ == "__main__":
    shan_xi()