import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Project-local helper object. In this file it supplies the request headers
# (`headers`), the URL look-up store (`db_storage`), `paserUrl`, `sendKafka`
# and `save_data`.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project-local helper object. In this file it supplies the logger and the
# attachment helpers `uptoOBS` / `tableUpdate`.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Jilin (gzw.jl.gov.cn) policy-document crawler.

# Seconds to wait on each HTTP request before giving up, so that a single
# stalled connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 30

# Substrings, matched against the lower-cased href, that mark a link as a
# downloadable attachment. '.doc' also covers '.docx'; 'xls' is kept without
# a leading dot because that is how the link was matched historically.
_ATTACHMENT_MARKERS = ('.pdf', '.wps', '.doc', 'xls', '.zip', '.rar', '.ppt')

# Caption of the "scan to open this page on your phone" QR widget that some
# article pages embed inside their body.
_QR_HINT = '扫一扫在手机打开当前页'


def _resolve_article_url(list_url, href):
    """Return the absolute article URL for an href found on the list page.

    Hrefs containing ``http`` are returned unchanged. Hrefs starting with
    ``.`` (e.g. ``./201906/t2019.html``) are appended to ``list_url`` with
    only the leading dot removed; this keeps the ``.../zcwj//2019...`` form
    (double slash) that is used as the de-duplication key in storage.
    Any other href is resolved with ``urljoin``.
    """
    if 'http' in href:
        return href
    if href.startswith('.'):
        return list_url + href[1:]
    return urljoin(list_url, href)


def _is_attachment(href):
    """Return True when ``href`` looks like a link to a downloadable file."""
    lowered = href.lower()
    return any(marker in lowered for marker in _ATTACHMENT_MARKERS)


def _remove_div(soup, div_id):
    """Delete ``<div id=div_id>`` from ``soup`` if it exists; otherwise no-op."""
    node = soup.find('div', id=div_id)
    if node is not None:
        node.decompose()


def _extract_origin_and_date(i_soup):
    """Return ``(pub_come, pub_time)`` parsed from an article page.

    Three page layouts are tried in order:

    1. ``<span class="source">`` / ``<span class="time">``;
    2. a ``<div class="zsy_cotitle">`` header whose first ``<p>`` holds both
       values;
    3. a ``class="share"`` bar with ``left`` / ``right`` children.

    Raises ``AttributeError`` or ``IndexError`` when none of the layouts
    matches; the caller treats that as a failed article.
    """
    try:
        i_come = i_soup.find('span', class_='source')
        i_time = i_soup.find('span', class_='time')
        pub_come = i_come.text.split('.write(" ')[1].split('");')[0].strip()
        pub_time = i_time.text.split('时间：')[1].strip()
        return pub_come, pub_time
    except (AttributeError, IndexError):
        # Layout 1 not present (missing span or unexpected text): fall through.
        pass

    header = i_soup.find('div', class_='zsy_cotitle')
    if header is not None:
        header_text = header.find('p').text
        try:
            pub_come = header_text.split('信息来源 > ')[1].split('发布时间：')[0].strip()
        except IndexError:
            pub_come = header_text.split('文章来源')[1].split('发布时间：')[0].strip()
        pub_time = header_text.split('发布时间：')[1].strip()
        return pub_come, pub_time

    pub = i_soup.find(class_='share')
    pub_time = pub.find(class_='left').find('span', class_='time').text
    if '时间' in pub_time:
        pub_time = pub_time.split('时间：')[1].strip()
    pub_come = pub.find(class_='right').find('span', class_='source').text.split('来源：')[1].strip()
    return pub_come, pub_time


def _upload_attachments(container, num, pub_time):
    """Upload every attachment linked from ``container`` and return their ids.

    Each ``<li><a href=...>`` whose href looks like a file is passed to
    ``baseCore.uptoOBS`` and, when the returned ``state`` is truthy, recorded
    with ``baseCore.tableUpdate``. The anchor's href is then rewritten to
    point at the stored copy. Items without an anchor/href, non-file links
    and failed uploads are skipped.
    """
    att_ids = []
    for li in container.find_all('li'):
        link = li.find('a')
        if link is None or not link.get('href'):
            continue
        fu_jian_href = link['href']
        if not _is_attachment(fu_jian_href):
            continue
        # The display name comes from the anchor text (the href itself is a
        # plain string and has no ``.text``).
        file_name = link.text.strip()
        category = os.path.splitext(fu_jian_href)[1]
        if category not in file_name:
            file_name = file_name + category
        retData = baseCore.uptoOBS(fu_jian_href, '1670', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '吉林市国资委', file_name, num, pub_time)
        att_ids.append(att_id)
        # Point the link in the article body at the stored copy.
        link['href'] = 'http://zzsn.luyuen.com/' + str(full_path)
    return att_ids


def _build_article(real_href, title, num):
    """Download one article and return the record to publish, or ``None``.

    ``None`` is returned (after logging) when the article has no usable body
    text. Any network or parsing error propagates to the caller.
    """
    href_text = requests.get(url=real_href, headers=baseTool.headers, verify=False,
                             timeout=_REQUEST_TIMEOUT)
    # Decode the raw bytes as UTF-8 directly. Re-encoding ``.text`` through
    # ISO-8859-1 only works when requests fell back to that charset and
    # raises for any page whose encoding was declared or detected.
    i_html = href_text.content.decode('utf-8')
    i_soup = BeautifulSoup(i_html, 'html.parser')
    # Convert relative paths in the page to absolute ones.
    abs_soup = baseTool.paserUrl(i_soup, real_href)

    pub_come, pub_time = _extract_origin_and_date(i_soup)

    attachment_box = None
    if abs_soup.find(class_='zsy_comain') is not None:
        # Strip the QR-code widgets and inline <style> blocks before reading
        # the body text.
        _remove_div(abs_soup, 'qr_container')
        _remove_div(abs_soup, 'div_div')
        for style_tag in abs_soup.find_all('style'):
            style_tag.extract()
        contentWithTag = abs_soup.find(class_='zsy_comain')
        attachment_box = abs_soup.find('div', style='width:920px; margin: 0 auto;')
        # NOTE(review): the first three <p> tags may carry the document
        # number ('号'); it was never published ('issuedNumber' stays '').
    else:
        contentWithTag = abs_soup.find(class_="content")
        if contentWithTag is not None:
            # Drop trailing attachment captions: any of the last ten <div>s
            # whose text contains a "<digits>." enumeration.
            for div in contentWithTag.find_all('div')[-10:]:
                if re.search(r'\d+\.', div.text):
                    div.extract()
        # TODO: attachment upload for this layout (div.wenjianfujian) is
        # currently disabled; only the article text is published.

    content = contentWithTag.text.strip() if contentWithTag is not None else ''
    if _QR_HINT in content:
        # str.replace returns a new string; keep the result.
        content = content.replace(_QR_HINT, '').strip()
        _remove_div(abs_soup, 'div_div')
        _remove_div(abs_soup, 'qr_container')
    # Articles without body text are not sent to the interface.
    if content == '' or content == 'None':
        log.info(f'{real_href}-----{title}----内容为空')
        return None

    id_list = []
    if attachment_box is not None:
        id_list = _upload_attachments(attachment_box, num, pub_time)

    log.info(title)
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields sent to Kafka.
    return {
        'attachmentIds': id_list,
        'author': '',
        'content': content,
        'contentWithTag': str(contentWithTag),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1670", 'relationName': "吉林市国资委", 'labelMark': "policy"}],
        'origin': pub_come,
        'organ': '',
        'topicClassification': '',
        'issuedNumber': '',
        'publishDate': pub_time,
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': real_href,
        'summary': '',
        'title': title
    }


def ji_lin():
    """Crawl the Jilin policy list page and publish every new article.

    For each ``<li>`` of the list the article URL is resolved and looked up
    in ``baseTool.db_storage``; known URLs are skipped. New articles are
    downloaded, cleaned, their attachments uploaded, and the resulting record
    is passed to ``baseTool.sendKafka`` (and ``baseTool.save_data`` when that
    returns a truthy value). Failures are logged per article and never
    propagate; a failure of the list page itself is logged as well. The
    number of processed articles and the elapsed time are logged at the end.
    """
    start = time.time()
    num = 0
    count = 0
    url = 'http://gzw.jl.gov.cn/zwgk/zcwj/'
    try:
        resp_text = requests.get(url=url, headers=baseTool.headers, verify=False,
                                 timeout=_REQUEST_TIMEOUT)
        resp_text.encoding = 'utf-8'
        list_soup = BeautifulSoup(resp_text.text, 'html.parser')
        # A missing list container raises here and is logged by the handler
        # at the bottom.
        entries = list_soup.find(class_='list ej_list').find_all('li')
        for entry in entries:
            link = entry.find('a')
            if link is None or not link.get('href'):
                # Not an article entry; do not let it abort the whole crawl.
                continue
            real_href = _resolve_article_url(url, link['href'])
            title = link.text.replace('\n', '')
            if baseTool.db_storage.find_one({'网址': real_href}):
                num += 1
                continue
            try:
                dic_news = _build_article(real_href, title, num)
                if dic_news is None:
                    continue
                if baseTool.sendKafka(dic_news):
                    baseTool.save_data(dic_news)
                num += 1
                count += 1
            except Exception as e:
                # One bad article must not stop the remaining ones.
                log.info(f'{real_href}-----{title}----{e}')
    except Exception as e:
        # Top-level boundary: report instead of failing silently.
        log.info(f'{url}----{e}')
    end = time.time()
    log.info(f'共{count}条...........共耗时 {end - start}秒')

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    ji_lin()