import time

import requests
from bs4 import BeautifulSoup

import os
import pandas as pd
import numpy as np

import BaseCore
# Shared project helper. In this file it supplies proxies (get_proxy), the
# de-duplication store (baseCore.r — presumably a Redis client, confirm),
# language detection, Kafka publishing and attachment cleanup.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

from reits import Policy
# Project helper used below for link rewriting (paserUrl) and attachment
# upload (attuributefile).
policy = Policy()

# Topic name handed to baseCore.sendkafka for every collected record.
topic = 'research_center_fourth'
# Site label ("Sichuan Provincial People's Government"); appended to 'REITs::'
# to form the key of the set that remembers already-collected URLs.
webname = '四川省人民政府_'
# Request headers sent with every HTTP call made by this module.
headers = {
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url, timeout=30):
    """Download ``url`` through a proxy and parse the response as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the connection / each read before
            giving up. Without a timeout ``requests`` waits forever, so a
            stalled proxy or server would freeze the whole crawl.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    proxies = baseCore.get_proxy()
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # Decode with the encoding detected from the body rather than the one
    # announced in the response headers.
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'html.parser')


def getFjContent(url, timeout=30):
    """Download ``url`` through a proxy and return the raw response body.

    Args:
        url: Address of the attachment to fetch.
        timeout: Seconds to wait for the connection / each read before
            giving up; prevents an unresponsive server from hanging the job.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    proxies = baseCore.get_proxy()
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # No encoding is set here: ``content`` is the undecoded byte payload, and
    # charset detection over a whole binary attachment is wasted work.
    return resp.content


def getDataJson(timeout=30):
    """Query the site-search API for 'REITs' policy documents.

    Posts a fixed form (first page, 20 results, newest first) to
    ``https://api.so-gov.cn/query/s`` through a proxy.

    Args:
        timeout: Seconds to wait for the connection / each read before
            giving up; prevents an unresponsive API from hanging the job.

    Returns:
        The value stored under ``'resultDocs'`` in the JSON response.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON has no ``'resultDocs'`` entry.
    """
    proxies = baseCore.get_proxy()
    url = 'https://api.so-gov.cn/query/s'
    form = {
        'siteCode': '5100000062',
        'tab': 'zcwj',
        'qt': 'REITs',
        'keyPlace': '0',
        'sort': 'dateDesc',
        'fileType': '',
        'timeOption': '0',
        'locationCode': '510000000000',
        'page': '1',
        'pageSize': '20',
        'ie': 'c0e059a8-7a00-4fa9-9d70-873a5284d8a0',
    }
    resp = requests.post(url, headers=headers, data=form, proxies=proxies, timeout=timeout)
    # Decode with the encoding detected from the body before parsing the JSON.
    resp.encoding = resp.apparent_encoding
    return resp.json()['resultDocs']


def getContent(url, publishDate, num):
    """Fetch a policy page and extract its written date and body.

    Args:
        url: Address of the policy detail page.
        publishDate: Unused here; kept so existing callers keep working.
        num: Unused here; kept so existing callers keep working.

    Returns:
        A tuple ``(writtenDate, content, contentWithTag)`` where
        ``writtenDate`` is the text following '成文日期：' in the fourth
        header item (``None`` when the header is absent or has a different
        layout), ``content`` is the plain text of the body element and
        ``contentWithTag`` is the body element itself.

    Raises:
        IndexError: If the page has neither a ``.contText`` nor a
            ``#cmsArticleContent`` element.
    """
    soup = getSoup(url)
    # Return value is ignored; presumably rewrites links inside ``soup`` in
    # place — confirm against Policy.paserUrl.
    policy.paserUrl(soup, url)

    # Only a missing item / missing label is an expected failure here, so
    # catch IndexError rather than using a bare except that would also hide
    # KeyboardInterrupt and SystemExit.
    try:
        header_items = soup.select(
            '#szfcontentwrap2022 > div.zfwjwzcontent > div.topbox > ul > li')
        writtenDate = header_items[3].text.split('成文日期：')[1].strip()
    except IndexError:
        writtenDate = None

    # Prefer the '.contText' container and fall back to '#cmsArticleContent'.
    try:
        contentWithTag = soup.select('.contText')[0]
    except IndexError:
        contentWithTag = soup.select('#cmsArticleContent')[0]
    content = contentWithTag.text

    return writtenDate, content, contentWithTag


def getData(data_, num):
    """Collect one search result and publish it to Kafka.

    The result is skipped when its URL is already a member of the
    ``'REITs::' + webname`` set. A URL containing '.pdf' / '.PDF' is uploaded
    as an attachment with empty text content; any other URL is fetched and
    parsed with ``getContent``. After a successful send the URL is added to
    the set; if sending fails, the failure is logged and attachments uploaded
    for this item are deleted again.

    Args:
        data_: One entry of the search result list. Must provide
            ``data_['data']`` with ``title``, ``docDate``, ``url``,
            ``siteLabel['value']``, ``myValues['DOCPUBNAME']`` and
            ``myValues['DOCNOVAL']``.
        num: Sequence number of the item, forwarded to the attachment
            uploader and to ``getContent``.

    Returns:
        None.
    """
    id_list = []
    title = data_['data']['title']
    publishDate = data_['data']['docDate']
    origin = data_['data']['siteLabel']['value']
    href = data_['data']['url']
    # De-duplicate by link: skip anything that was already collected.
    is_member = baseCore.r.sismember('REITs::' + webname, href)
    if is_member:
        return
    organ = data_['data']['myValues']['DOCPUBNAME']
    pub_hao = data_['data']['myValues']['DOCNOVAL']
    summary = ''
    if '.pdf' in href or '.PDF' in href:
        # The result is itself a PDF: there is no HTML body to parse, so the
        # file is stored as an attachment and the text fields stay empty.
        content = ''
        contentWithTag_str = ''
        writtenDate = None
        fj_title = title + '.pdf'
        fj_href = href
        att_id, _ = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
    else:
        writtenDate, content, contentWithTag = getContent(href, publishDate, num)
        contentWithTag_str = str(contentWithTag)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    lang = baseCore.detect_language(content)
    dic_info = {
        'attachmentIds': id_list,
        'subjectId': '1729315113088765953',
        'lang': lang,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'checkStatus': 1,
        # Subject id followed by the current Unix time in whole seconds.
        'id': '1729315113088765953'+str(int(time.time())),
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': href,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729046053927178241'
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        # Only remember the link once the record has actually been sent.
        baseCore.r.sadd('REITs::' + webname, href)
        log.info(f'采集成功--{title}--{href}')
    except Exception as e:
        # Record the failure instead of dropping it silently, then roll back
        # the attachments that were uploaded for this item.
        log.error(f'采集失败--{title}--{href}--{e}')
        for att_id in id_list:
            baseCore.deliteATT(att_id)
    return


def doJob():
    """Fetch the search results and process each one in order.

    Every result is handed to ``getData`` together with its 1-based position
    in the result list.
    """
    results = getDataJson()
    for position, item in enumerate(results, start=1):
        getData(item, position)

if __name__ == '__main__':
    # Always call baseCore.close(), even when the crawl aborts with an
    # exception; the exception itself still propagates afterwards.
    try:
        doJob()
    finally:
        baseCore.close()
