import os
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

import BaseCore

# Shared project helper. In this file it supplies the logger, proxies,
# language detection, the `r` set store used for URL de-duplication,
# Kafka delivery (`sendkafka`) and attachment removal (`deliteATT`).
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# NOTE(review): this import runs after BaseCore has been instantiated; it is
# not visible from this file whether that order matters, so it is left as is.
from reits import Policy
# Used below to create the browser driver (`createDriver`) and to upload
# attachments (`attuributefile`).
policy = Policy()


# Topic name handed to baseCore.sendkafka for every collected article.
topic = 'research_center_fourth'
# Site name; appended to 'REITs::' to form the key of the de-duplication set.
webname = '江苏省人民政府'

# HTTP headers sent with every requests.get call in this module.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
    ),
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
}


def getSoup(url, timeout=30):
    """Download ``url`` through a proxy and return it parsed as BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    proxies = baseCore.get_proxy()
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # Decode using the charset detected from the body rather than the one
    # derived from the HTTP headers.
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'html.parser')


def getFjContent(url, timeout=30):
    """Download ``url`` through a proxy and return the raw response body.

    Args:
        url: Address of the attachment to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout expires.
    """
    proxies = baseCore.get_proxy()
    resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # ``content`` is the undecoded byte payload, so no charset detection is
    # needed (it only matters for ``resp.text``).
    return resp.content


def getContentA(url, num, publishDate, title, origin, summary):
    """Collect an article whose body lives in ``div#zoom`` and send it on.

    The issuing organ is read from ``div.sp_time`` and the body from
    ``div#zoom``. ``<script>``/``<style>`` tags are stripped, every image in
    the body is passed to ``policy.attuributefile``, and the assembled record
    is handed to ``baseCore.sendkafka``. On success the URL is added to the
    ``'REITs::' + webname`` set used for de-duplication.

    Args:
        url: Address of the article page.
        num: Running index of the article, forwarded to
            ``policy.attuributefile``.
        publishDate: Publication date string taken from the search result.
        title: Article title; also the base name for images without a
            ``title`` attribute.
        origin: Source text taken from the search result.
        summary: Abstract text taken from the search result.

    Returns:
        None. Pages that do not match the expected layout are logged and
        skipped instead of raising.
    """
    from urllib.parse import urljoin

    id_list = []
    soup = getSoup(url)
    source_div = soup.find('div', class_='sp_time')
    contentWithTag = soup.find('div', attrs={'id': 'zoom'})
    if source_div is None or contentWithTag is None:
        log.error(f'页面结构异常，跳过--{title}--{url}')
        return
    try:
        organ = source_div.text.split('来源：')[1].split('字体')[0].strip()
    except IndexError:
        # The expected '来源：' label is not present in the source line.
        log.error(f'页面结构异常，跳过--{title}--{url}')
        return

    # Drop non-content tags so they leak into neither the text nor the HTML.
    for tag in contentWithTag.find_all(['script', 'style']):
        tag.decompose()

    untitled_no = 1
    for img in contentWithTag.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        # Resolve against the page URL so root-relative, relative and already
        # absolute sources all end up as a single well-formed URL.
        fj_href = urljoin(url, src)
        img_title = img.get('title')
        if img_title is not None:
            fj_title = img_title.strip()
        else:
            # No title attribute: derive a name from the article title.
            if 'img/png' in src:
                fj_title = f'{title}-{untitled_no}.png'
            elif 'img/jpg' in src:
                fj_title = f'{title}-{untitled_no}.jpg'
            else:
                fj_title = None
            untitled_no += 1
        if fj_title is None:
            # Never reuse the previous image's name for an unknown type.
            log.info(f'图片无法命名，跳过--{fj_href}')
            continue
        try:
            att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        except Exception as e:
            # One failing image must not stop the remaining ones.
            log.error(f'附件上传失败--{fj_href}--{e}')
            continue
        if att_id:
            id_list.append(att_id)
            # NOTE(review): this sets `href` (not `src`) on the <img> tag;
            # kept unchanged - confirm that consumers expect it.
            img['href'] = full_path

    content = contentWithTag.text
    contentWithTag_str = str(contentWithTag)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    lang = baseCore.detect_language(content)
    dic_info = {
        'attachmentIds': id_list,
        'subjectId': '1729315113088765953',
        'lang': lang,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        'checkStatus': 1,
        # Record id: subject id followed by the current epoch second.
        'id': '1729315113088765953' + str(int(time.time())),
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': url,
        'writtenDate': None,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': '',
        'summary': summary,
        'createDate': time_now,
        'sid': '1729042894974537730',
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, url)
        log.info(f'采集成功--{title}--{url}')
    except Exception as e:
        log.error(f'采集失败--{title}--{url}--{e}')
        # Delivery or bookkeeping failed: remove the attachments that were
        # uploaded for this article.
        for att_id in id_list:
            baseCore.deliteATT(att_id)


def getContentB(url, num, publishDate, title, origin, summary):
    """Collect an article that carries an ``xxgk_table`` metadata table.

    Issuing organ, written date and document number are parsed from
    ``table.xxgk_table``; the body is read from ``div.article_content``.
    ``<script>``/``<style>`` tags are stripped, every image in the body is
    passed to ``policy.attuributefile``, and the assembled record is handed
    to ``baseCore.sendkafka``. On success the URL is added to the
    ``'REITs::' + webname`` set used for de-duplication.

    Args:
        url: Address of the article page.
        num: Running index of the article, forwarded to
            ``policy.attuributefile``.
        publishDate: Publication date string taken from the search result.
        title: Article title; also the base name for images without a
            ``title`` attribute.
        origin: Source text taken from the search result.
        summary: Abstract text taken from the search result.

    Returns:
        None. Pages that do not match the expected layout are logged and
        skipped instead of raising.
    """
    from urllib.parse import urljoin

    id_list = []
    soup = getSoup(url)
    meta_table = soup.find('table', class_='xxgk_table')
    contentWithTag = soup.find('div', class_='article_content')
    if meta_table is None or contentWithTag is None:
        log.error(f'页面结构异常，跳过--{title}--{url}')
        return
    info = meta_table.text.replace(' ', '')
    try:
        # Each field is the text between its own label and the next label.
        organ = info.split('发布机构：')[1].split('发文日期')[0].strip()
        writtenDate = info.split('发文日期：')[1].split('标题：')[0].strip()
        pub_hao = info.split('文号：')[1].split('内容概述：')[0].strip()
    except IndexError:
        # At least one of the expected labels is missing from the table.
        log.error(f'页面结构异常，跳过--{title}--{url}')
        return

    # Drop non-content tags so they leak into neither the text nor the HTML.
    for tag in contentWithTag.find_all(['script', 'style']):
        tag.decompose()

    untitled_no = 1
    for img in contentWithTag.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        # Resolve against the page URL regardless of whether the image has a
        # title; sources that already carry a scheme are returned unchanged.
        fj_href = urljoin(url, src)
        img_title = img.get('title')
        if img_title is not None:
            fj_title = img_title.strip()
        else:
            # No title attribute: derive a name from the article title.
            if 'image/png' in src:
                fj_title = f'{title}-{untitled_no}.png'
            elif 'image/jpg' in src:
                fj_title = f'{title}-{untitled_no}.jpg'
            else:
                fj_title = None
            untitled_no += 1
        if fj_title is None:
            # Never reuse the previous image's name for an unknown type.
            log.info(f'图片无法命名，跳过--{fj_href}')
            continue
        try:
            att_id, full_path = policy.attuributefile(fj_title, fj_href, num, publishDate)
        except Exception as e:
            # One failing image must not stop the remaining ones.
            log.error(f'附件上传失败--{fj_href}--{e}')
            continue
        if att_id:
            id_list.append(att_id)
            # NOTE(review): this sets `href` (not `src`) on the <img> tag;
            # kept unchanged - confirm that consumers expect it.
            img['href'] = full_path

    content = contentWithTag.text.strip()
    contentWithTag_str = str(contentWithTag)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    lang = baseCore.detect_language(content)
    dic_info = {
        'attachmentIds': id_list,
        'subjectId': '1729315113088765953',
        'lang': lang,
        'author': '',
        'content': content,
        'contentWithTag': contentWithTag_str,
        'deleteFlag': 0,
        # Record id: subject id followed by the current epoch second.
        'id': '1729315113088765953' + str(int(time.time())),
        'title': title,
        'publishDate': publishDate,
        'origin': origin,
        'sourceAddress': url,
        'writtenDate': writtenDate,
        'organ': organ,
        'topicClassification': '',
        'issuedNumber': pub_hao,
        'summary': summary,
        'createDate': time_now,
        'sid': '1729042894974537730',
    }
    try:
        baseCore.sendkafka(dic_info, topic)
        baseCore.r.sadd('REITs::' + webname, url)
        log.info(f'采集成功--{title}--{url}')
    except Exception as e:
        log.error(f'采集失败--{title}--{url}--{e}')
        # Delivery or bookkeeping failed: remove the attachments that were
        # uploaded for this article.
        for att_id in id_list:
            baseCore.deliteATT(att_id)


def doJob():
    """Walk the REITs search-result page and collect every new article.

    Opens the search URL (query ``REITs``, page 1) in a browser driver,
    reads each ``news-result`` entry, skips links already present in the
    ``'REITs::' + webname`` set, and dispatches the rest to ``getContentA``
    or ``getContentB`` depending on the entry's label. The driver is always
    shut down, even when an error occurs.

    Returns:
        None.
    """
    pattern = r"\d{4}-\d{2}-\d{2}"
    url = 'http://www.jiangsu.gov.cn/jsearchfront/search.do?websiteid=320000000100000&searchid=12&pg=&p=1&tpl=38&serviceType=&cateid=27&q=REITs&pq=&oq=&eq=&pos=&sortType=0&begin=&end='
    driver = policy.createDriver()
    try:
        driver.get(url)
        # Presumably gives the page time to render the result list.
        time.sleep(5)
        div_list = driver.find_elements(By.CLASS_NAME, 'news-result')
        num = 1

        for div in div_list:
            link = div.find_element(By.CLASS_NAME, 'jcse-news-title').find_element(By.TAG_NAME, 'a')
            title = link.get_attribute('title').strip()
            href = link.get_attribute('href')
            # De-duplicate by link.
            if baseCore.r.sismember('REITs::' + webname, href):
                continue
            category = div.find_element(By.CLASS_NAME, 'biaoqian').text.strip()
            summary = div.find_element(By.CLASS_NAME, 'jcse-news-abs-content').text.strip()
            dateInfo = div.find_element(By.CLASS_NAME, 'jcse-news-date').text
            date_match = re.search(pattern, dateInfo)
            if date_match is None:
                log.error(f'未找到发布日期，跳过--{title}--{href}')
                continue
            publishDate = date_match.group(0)
            # Whatever remains next to the date is treated as the origin.
            origin = dateInfo.replace(publishDate, '').strip()
            fetch = getContentA if category == '政务公开' else getContentB
            try:
                fetch(href, num, publishDate, title, origin, summary)
            except Exception as e:
                # One broken article must not abort the remaining ones.
                log.error(f'采集失败--{title}--{href}--{e}')
            num += 1
            time.sleep(5)
    finally:
        # quit() ends the whole browser session, whereas close() only closes
        # the current window.
        driver.quit()


if __name__ == '__main__':
    try:
        doJob()
    finally:
        # Always call baseCore.close(), even when doJob() raises.
        baseCore.close()
