import datetime
import os
import re
import time
import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq

# Project helper shared by the crawler below; in this file it supplies the
# de-duplication store (db_storage), default request headers, paserUrl,
# sendKafka and save_data.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project helper used in this file for attachment upload (uptoOBS),
# attachment bookkeeping (tableUpdate) and the logger.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Jiangxi
# The list endpoint is paged by record index; three windows of 60 records
# are requested: 1-60, 61-120 and 121-180.
_JX_WINDOW_SIZE = 60
_JX_WINDOW_COUNT = 3
# Seconds allowed for any single HTTP call so a stalled server cannot hang the crawl.
_JX_TIMEOUT = 30
# Substrings (matched case-insensitively) that mark a link as a downloadable
# attachment; '.doc' also covers '.docx' and '.xls' also covers '.xlsx'.
_JX_ATTACHMENT_MARKERS = ('.pdf', '.doc', '.xls', '.ppt', '.zip', '.rar')
# Fallback patterns for the issued number, searched in the body text.
_JX_ISSUED_NUMBER_RE = re.compile(r'(赣国资.{1,}?号)|(国.{1,}?号)')
_JX_PAYLOAD = "col=1&webid=175&path=http%3A%2F%2Fgzw.jiangxi.gov.cn%2F&columnid=22977&sourceContentType=1&unitid=402016&webname=%E6%B1%9F%E8%A5%BF%E7%9C%81%E5%9B%BD%E6%9C%89%E8%B5%84%E4%BA%A7%E7%9B%91%E7%9D%A3%E7%AE%A1%E7%90%86%E5%A7%94%E5%91%98%E4%BC%9A&permissiontype=0"
# NOTE(review): the session cookies below are hard-coded; confirm the endpoint
# still accepts them (or works without them).
_JX_LIST_HEADERS = {
    'Connection': 'keep-alive',
    'Accept': 'application/xml, text/xml, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'http://gzw.jiangxi.gov.cn',
    'Referer': 'http://gzw.jiangxi.gov.cn/col/col22977/index.html?uid=402016&pageNum=9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'JSESSIONID=F601A052571881210819664F5BD38015; JSESSIONID=6E54DB27D82E844B825DD675AE19E399',
}


def _jx_quote_cell(doc, row, col, label):
    """Return the text of one cell of the "xxgk-quote" table with *label* removed."""
    selector = f'div[class="xxgk-quote"]>table>tbody>tr:nth-child({row})>td:nth-child({col})'
    return doc(selector).text().replace(label, '')


def _jx_find_issued_number(content):
    """Return the first issued-number-like string found in *content*, or ''."""
    # group(0) is the whole match whichever alternative fired; indexing the
    # first capture group alone yields '' when only the second one matched.
    match = _JX_ISSUED_NUMBER_RE.search(content)
    return match.group(0) if match else ''


def _jx_upload_attachments(soup, num, written_date):
    """Upload every attachment linked from *soup* and rewrite its href in place.

    Links without an href, links that do not look like attachments, and links
    whose upload reports a falsy ``state`` are left untouched.

    Returns the list of attachment ids returned by ``baseCore.tableUpdate``.
    """
    id_list = []
    for link in soup.find_all('a'):
        file_href = link.get('href')
        if not file_href:
            continue
        lowered = file_href.lower()
        if not any(marker in lowered for marker in _JX_ATTACHMENT_MARKERS):
            continue
        file_name = link.text.strip()
        # Make sure the stored name carries the file extension.
        category = os.path.splitext(file_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(file_href, '1689', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '江西省国资委', file_name, num, written_date)
        id_list.append(att_id)
        # Point the link in the stored HTML at the uploaded copy.
        link['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
    return id_list


def _jx_process_detail(href, num):
    """Fetch one detail page, build its record, send it to Kafka and save it.

    *num* is forwarded unchanged to ``baseCore.tableUpdate`` for attachments.

    Returns True when the record was sent and saved, False when the page had
    no body text or ``baseTool.sendKafka`` returned a falsy value. Exceptions
    from the network or the project helpers propagate to the caller.
    """
    # verify=False is kept from the original request.
    href_res = requests.get(url=href, headers=baseTool.headers, verify=False, timeout=_JX_TIMEOUT)
    href_res.encoding = href_res.apparent_encoding
    page_soup = baseTool.paserUrl(href_res.text, href)
    doc = pq(str(page_soup))

    try:
        origin = doc('td:contains("信息来源：")').text().replace('信息来源：', '')
    except Exception:
        # The source field is optional; fall back to an empty value.
        origin = ''
    title = doc('tr[class="biaoti"]>td:nth-child(1)').text().replace('标题:', '')
    organ = _jx_quote_cell(doc, 1, 2, '发文机关:')
    pub_hao = _jx_quote_cell(doc, 1, 3, '文号:')
    topic_classification = _jx_quote_cell(doc, 2, 1, '主题分类:')
    written_date = _jx_quote_cell(doc, 2, 3, '成文日期:')

    # Re-parse only the article body so attachments and text come from it alone.
    body_soup = baseTool.paserUrl(str(doc('div[id="zoom"]')), href)
    id_list = _jx_upload_attachments(body_soup, num, written_date)

    content_with_tag = str(body_soup.prettify())
    content = body_soup.text
    if not content:
        log.info(f'-----{href}----{title}----内容为空-----')
        return False
    if len(pub_hao) < 1:
        pub_hao = _jx_find_issued_number(content)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields of the record sent to Kafka.
    dic_news = {
        'attachmentIds': id_list,
        'author': '',
        'content': str(content),
        'contentWithTag': str(content_with_tag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1689", 'relationName': "江西省国资委", 'labelMark': "policy"}],
        'origin': origin,
        'organ': organ,
        'topicClassification': topic_classification,
        'issuedNumber': pub_hao,
        'publishDate': None,
        'writtenDate': written_date,
        'sid': '1697458829758697473',
        'sourceAddress': href,
        'summary': '',
        'title': title,
    }
    if not baseTool.sendKafka(dic_news):
        return False
    baseTool.save_data(dic_news)
    return True


def jiang_xi():
    """Crawl policy documents from gzw.jiangxi.gov.cn and publish new ones.

    The list endpoint is queried in record windows of 60 (1-60, 61-120,
    121-180). Every ``href='...'`` found in a response is treated as a detail
    page. Pages whose URL is already present in ``baseTool.db_storage`` are
    skipped; the rest are parsed, their attachments uploaded, and the
    resulting record sent to Kafka and saved.

    Failures are logged and the crawl moves on: a failed list request skips
    that window, a failed detail page skips that page. Takes no arguments and
    returns None; the number of saved records and the elapsed time are logged.
    """
    num = 0      # running counter forwarded to baseCore.tableUpdate
    count = 0    # records saved during this run
    start_time = time.time()

    for window in range(_JX_WINDOW_COUNT):
        startrecord = window * _JX_WINDOW_SIZE + 1
        endrecord = (window + 1) * _JX_WINDOW_SIZE
        url = f"http://gzw.jiangxi.gov.cn/module/web/jpage/dataproxy.jsp?startrecord={startrecord}&endrecord={endrecord}&perpage=20"
        try:
            resp_text = requests.request(
                "POST", url, headers=_JX_LIST_HEADERS, data=_JX_PAYLOAD, timeout=_JX_TIMEOUT
            ).text
        except requests.RequestException as e:
            log.error(f'-----{url}----list request failed: {e}-----')
            continue

        for href in re.findall("href='(.*?)'", resp_text):
            try:
                if baseTool.db_storage.find_one({'网址': href}):
                    # Already collected; still advances the running counter.
                    num += 1
                    continue
                saved = _jx_process_detail(href, num)
            except Exception as e:
                # Best-effort crawl: record the failure and move to the next page.
                log.error(f'-----{href}----detail page failed: {e}-----')
                continue
            if saved:
                num += 1
                count += 1

    end_time = time.time()
    log.info(f'共抓取{count}条数据,共耗时{end_time - start_time}')

# Script entry point: run the Jiangxi crawl only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    jiang_xi()