import os
import re
import time
import datetime

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from retry import retry
from selenium.webdriver.common.by import By

import BaseCore

# Shared project helper. In this file it supplies the logger, proxy lookup
# (get_proxy), the `r` set store used for URL de-duplication (presumably a
# Redis client -- confirm in BaseCore), record publishing (sendkafka) and
# attachment clean-up (deliteATT).
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# NOTE(review): this import sits after baseCore is created; it is not visible
# from here whether that ordering is required, so it is left in place.
from reits import Policy
# Project helper used below for URL rewriting (paserUrl), attachment
# registration (attuributefile) and browser creation (createDriver).
policy = Policy()


# Topic name passed to baseCore.sendkafka() for every collected record.
topic = 'research_center_fourth'
# Site name ("People's Government of Yunnan Province"); appended to the
# 'REITs::' prefix to form the de-duplication set key.
webname = '云南省人民政府'
# HTTP headers sent with every requests.get() call in this file.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
}


def getSoup(url, timeout=30):
    """Download *url* through a proxy and return it as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the connection / between received bytes
            before giving up. Without it a stalled proxy or server would
            block the whole crawl indefinitely.

    Returns:
        The response body parsed with the 'html.parser' backend.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # Decode with the charset detected from the body rather than the one
    # advertised in the response headers.
    req.encoding = req.apparent_encoding
    return BeautifulSoup(req.text, 'html.parser')


@retry(tries=3, delay=5)
def getFjContent(url, timeout=60):
    """Download an attachment through a proxy and return its raw bytes.

    The call is attempted up to three times, five seconds apart, by the
    ``retry`` decorator.

    Args:
        url: Address of the file to download.
        timeout: Seconds to wait on the connection / between received bytes.
            A stalled download now raises, which lets ``retry`` try again
            instead of hanging forever.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        requests.exceptions.RequestException: If every attempt fails.
    """
    ip = baseCore.get_proxy()
    req = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    # No charset detection here: ``content`` is the raw byte payload, so
    # setting ``encoding`` would only cost a scan over a binary file.
    return req.content


def _collectAttachments(anchors, num, publishDate, id_list, unnamed_seq):
    """Register the files linked by *anchors* and repoint them at the stored copy.

    Args:
        anchors: Iterable of ``<a>`` tags to inspect.
        num: Passed through to ``policy.attuributefile``.
        publishDate: Passed through to ``policy.attuributefile``.
        id_list: List that receives the id of every registered attachment.
            It is appended to in place so ids collected before a failure
            are not lost.
        unnamed_seq: Next number to use as the title of a link without text.

    Returns:
        The updated ``unnamed_seq`` counter.
    """
    for a in anchors:
        fj_href = a.get('href')
        if not fj_href:
            # Anchors without a target (e.g. named anchors) have nothing to
            # download; previously they raised TypeError on the check below.
            continue
        if 'http' not in fj_href:
            fj_href = 'https://www.yn.gov.cn' + fj_href

        fj_title = a.text.strip()
        if fj_title == '':
            fj_title = str(unnamed_seq)
            unnamed_seq += 1
        # Make sure the stored title carries the file extension of the link.
        category = os.path.splitext(fj_href)[1]
        if category not in fj_title:
            fj_title = fj_title + category

        att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        if att_id:
            id_list.append(att_id)
            a['href'] = full_path
    return unnamed_seq


def getContent(url, publishDate, num):
    """Fetch an article page and extract its text, markup and attachments.

    The body is taken from ``div.content`` or, failing that,
    ``div.TRS_UEDITOR``. ``<script>`` and ``<style>`` elements are removed
    from it. Every link inside the body, and every link inside an optional
    ``ul.apfile`` list, is registered through ``policy.attuributefile``;
    registered links have their ``href`` rewritten in place.

    Args:
        url: Address of the article page.
        publishDate: Publication date forwarded to attachment registration.
        num: Running item number forwarded to attachment registration.

    Returns:
        A tuple ``(content, contentWithTag, id_list)``: the stripped plain
        text, the body tag itself, and the list of attachment ids.

    Raises:
        AttributeError: If the page contains neither body container. The
            exception type matches what the original code raised in this
            situation; the message now names the page.
    """
    soup = getSoup(url)
    policy.paserUrl(soup, url)

    contentWithTag = soup.find('div', class_='content')
    if contentWithTag is None:
        contentWithTag = soup.find('div', class_='TRS_UEDITOR')
    if contentWithTag is None:
        raise AttributeError(f'no content container found in {url}')

    for script in contentWithTag.find_all('script'):
        script.decompose()
    for style in contentWithTag.find_all('style'):
        style.decompose()
    content = contentWithTag.text.strip()

    id_list = []
    num_ = _collectAttachments(contentWithTag.find_all('a'), num, publishDate, id_list, 1)

    # The separate attachment list is optional and handled best-effort: a
    # failure here is logged but does not discard the article itself.
    apfile = soup.find('ul', class_='apfile')
    if apfile is not None:
        try:
            _collectAttachments(apfile.find_all('a'), num, publishDate, id_list, num_)
        except Exception as e:
            log.error(f'attachment list failed--{url}--{e}')
    return content, contentWithTag, id_list


def getData(div, num):
    """Build a record from one search-result element and publish it.

    Reads title, link, publish date, issuing organ and document number from
    *div*. Links already present in the ``'REITs::' + webname`` set are
    skipped. A link containing ``.pdf``/``.PDF`` is registered directly as
    an attachment; any other link is fetched with ``getContent``. The record
    is sent with ``baseCore.sendkafka`` and the link is then added to the
    de-duplication set. If that step fails, the failure is logged and the
    attachments registered for this item are deleted.

    Args:
        div: Selenium element for a single search result.
        num: Running item number forwarded to attachment registration.

    Returns:
        None.
    """
    pattern = r"\d{4}-\d{2}-\d{2}"
    title = div.find_element(By.CLASS_NAME, 'title').find_element(
        By.CLASS_NAME, 'fontlan').get_attribute('title').strip()
    href = div.find_element(By.CLASS_NAME, 'fontlan').get_attribute('href')

    # De-duplicate by link.
    if baseCore.r.sismember('REITs::' + webname, href):
        return

    # The first yyyy-mm-dd date in the result text is the publish date.
    try:
        date_match = re.search(pattern, div.find_element(By.CLASS_NAME, 'content').text)
    except Exception:
        date_match = None
    publishDate = date_match.group(0) if date_match else None

    # Metadata rows: paragraph 0 holds the document number, paragraph 1 the
    # issuing organ. If either lookup fails both fields are left empty.
    try:
        rows = div.find_element(By.CLASS_NAME, 'rowtab').find_elements(
            By.TAG_NAME, 'div')[0].find_elements(By.TAG_NAME, 'p')
        organ = rows[1].find_element(By.CLASS_NAME, 'txt').text.strip()
        pub_hao = rows[0].find_element(By.CLASS_NAME, 'txt').text.strip()
        if pub_hao == '无':
            pub_hao = ''
    except Exception:
        organ = ''
        pub_hao = ''

    if '.pdf' in href or '.PDF' in href:
        # The result links straight to a PDF: there is no page body, the
        # file itself is the only attachment.
        id_list = []
        content = ''
        contentWithTag_str = ''
        att_id, full_path = policy.attuributefile(title + '.pdf', href, num, publishDate)
        if att_id:
            id_list.append(att_id)
    else:
        content, contentWithTag, id_list = getContent(href, publishDate, num)
        contentWithTag_str = str(contentWithTag)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info = {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'checkStatus': 1,
        'id': '',
        'title': title,
        'publishDate': publishDate,
        'origin': '云南省人民政府',
        'sourceAddress': href,
        'writtenDate': None,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': '',
        'createDate': time_now,
        'sid': '1729046848292892673'
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, href)
        log.info(f'采集成功--{title}--{href}')
    except Exception as e:
        # Previously this failure was swallowed without any trace.
        log.error(f'publish failed--{title}--{href}--{e}')
        # Roll back the attachments registered for this item.
        for att_id in id_list:
            baseCore.deliteATT(att_id)

def doJob():
    """Crawl the REITs search results and hand every result to ``getData``.

    Opens the search page in a browser created by ``policy.createDriver``,
    clicks the list entries at index 3 and 4 of the category bar in turn,
    and walks through every result page of each. The browser is shut down
    when the crawl ends, whether it finished or failed.
    """
    url = 'https://sheng.so-gov.cn/s?siteCode=5300000033&qt=REITs'
    driver = policy.createDriver()
    try:
        driver.get(url)
        time.sleep(2)
        num = 1
        for tab_index in range(3, 5):
            driver.find_elements(By.XPATH, '/html/body/div/div[6]/div[2]/div[3]/ul/li')[tab_index].click()
            time.sleep(2)
            if tab_index == 3:
                driver.find_element(By.ID, 'key_place_context_id').click()
                time.sleep(2)
            # The second-to-last pagination link carries the page count;
            # without a pagination bar there is a single page.
            try:
                total = int(
                    driver.find_element(By.CLASS_NAME, 'pagination').find_elements(By.TAG_NAME, 'a')[-2].text)
            except Exception:
                total = 1
            for _ in range(total):
                time.sleep(2)
                div_list = driver.find_elements(By.XPATH, '//*[@id="results"]/div')
                for div in div_list:
                    getData(div, num)
                    num += 1
                try:
                    driver.find_element(By.CLASS_NAME, 'pagination').find_element(By.CLASS_NAME, 'next').click()
                except Exception:
                    # No usable "next" control (e.g. last or only page).
                    pass
    finally:
        # Always end the browser session so no browser process is left behind.
        driver.quit()



if __name__ == '__main__':
    # Call baseCore.close() even when the crawl aborts with an exception.
    try:
        doJob()
    finally:
        baseCore.close()
