import os
import time
import requests
from bs4 import BeautifulSoup


# Project helper instance. fu_jian() below reads its `headers` and
# `db_storage` members and calls its paserUrl / sendKafka / save_data methods.
from ClassTool import ClassTool
baseTool = ClassTool()

# Project helper instance. fu_jian() below calls its pdf_content / uptoOBS /
# tableUpdate methods; `log` is the logger it hands out and is used for all
# progress messages in this file.
from BaseCore import BaseCore
baseCore = BaseCore()
log = baseCore.getLogger()

# Fujian
_FU_JIAN_SITE = 'http://gzw.fujian.gov.cn/'
_FU_JIAN_LIST_URL = _FU_JIAN_SITE + 'zwgk/zcfg/'
# Seconds; the PDF download already used this limit, now every request does.
_FU_JIAN_TIMEOUT = 20
# Substrings that mark an attachment link. '.doc' also covers '.docx' and
# '.xls' covers '.xlsx'; the link is lower-cased before the comparison.
_FU_JIAN_ATTACH_EXTS = ('.doc', '.xls', '.pdf', '.zip', '.rar', '.ppt')
# Outcomes reported by _fu_jian_handle_item().
_FU_JIAN_SKIPPED = 'skipped'
_FU_JIAN_DUPLICATE = 'duplicate'
_FU_JIAN_SENT = 'sent'


def _fu_jian_get_soup(href):
    """Download *href* and parse it, using the detected (apparent) encoding."""
    resp = requests.get(url=href, headers=baseTool.headers, verify=False,
                        timeout=_FU_JIAN_TIMEOUT)
    resp.encoding = resp.apparent_encoding
    return BeautifulSoup(resp.text, 'html.parser')


def _fu_jian_is_404(soup):
    """Return True when the first <strong> of the page reads exactly '404'."""
    strong = soup.find('strong')
    return strong is not None and str(strong.text) == '404'


def _fu_jian_resolve(href_):
    """Turn the raw list-page href into an absolute URL by trial and error.

    The candidates are tried in the same order as before: relative to
    ``/zwgk/``, relative to ``/zwgk/zcfg/``, then ``../../`` mapped to the
    site root. The first candidate whose page is not a 404 page wins; if all
    of them look like 404 pages the last candidate is kept.

    Returns:
        ``(href, soup)`` - the chosen URL and its parsed page.
    """
    flattened = href_.replace('../', './')
    candidates = (
        flattened.replace('./', _FU_JIAN_SITE + 'zwgk/'),
        flattened.replace('./', _FU_JIAN_SITE + 'zwgk/zcfg/'),
        href_.replace('../../', _FU_JIAN_SITE),
    )
    for href in candidates:
        soup = _fu_jian_get_soup(href)
        if not _fu_jian_is_404(soup):
            break
    return href, soup


def _fu_jian_between(text, start, end):
    """Return the stripped text between *start* and *end*, '' if no *start*."""
    parts = text.split(start)
    if len(parts) < 2:
        return ''
    return parts[1].split(end)[0].strip()


def _fu_jian_with_ext(name, link):
    """Append the file extension of *link* to *name* unless already present."""
    category = os.path.splitext(link)[1]
    return name if category in name else name + category


def _fu_jian_parse_pdf(real_href, title, num, id_list):
    """Handle an article that is itself a remote PDF.

    Downloads the file, extracts its text and uploads it to the file server;
    the attachment id is appended to *id_list*. Returns the parsed fields as
    a dict, or None when the upload is reported as failed.
    """
    resp_content = requests.get(real_href, headers=baseTool.headers,
                                verify=False,
                                timeout=_FU_JIAN_TIMEOUT).content
    content = baseCore.pdf_content(resp_content)
    # The original only assigned file_name when the extension was missing,
    # leaving it unbound (or stale from a previous article) otherwise.
    file_name = _fu_jian_with_ext(title, real_href)
    retData = baseCore.uptoOBS(real_href, '1673', file_name)
    if not retData['state']:
        return None
    att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委', file_name,
                                             num, '')
    id_list.append(att_id)
    return {'content': content, 'contentwithtag': '', 'pub_source': '',
            'pub_time': None, 'pub_hao': ''}


def _fu_jian_upload_attachments(i_soup, num, pub_time, id_list):
    """Upload every attachment linked from the page and rewrite its href."""
    attach_ul = i_soup.find('ul', class_='clearflx myzj_xl_list')
    links = attach_ul.find_all('a') if attach_ul is not None else []
    for link in links:
        fj_href = link.get('href')
        if not fj_href:
            continue
        lowered = fj_href.lower()
        if not any(ext in lowered for ext in _FU_JIAN_ATTACH_EXTS):
            continue
        file_name = _fu_jian_with_ext(link.text, fj_href)
        log.info(fj_href)
        # Attachment found: upload it to the file server.
        retData = baseCore.uptoOBS(fj_href, '1673', file_name)
        if not retData['state']:
            continue
        att_id, full_path = baseCore.tableUpdate(retData, '福建省国资委',
                                                 file_name, num, pub_time)
        id_list.append(att_id)
        # Point the link at the file-server copy (the scheme previously
        # lacked the '//' separator, producing an unusable URL).
        link['href'] = 'http://zzsn.luyuen.com/' + str(full_path)


def _fu_jian_parse_html(i_soup, real_href, title, num, id_list):
    """Extract the article fields from an HTML detail page.

    Two layouts are recognised: the regular one (``xl_tit2_l`` meta line plus
    ``xl_con1`` body) and a fallback whose container carries the class string
    ``tabs tab_base_01 rules_con1``. Returns the parsed fields as a dict, or
    None when no content could be found.
    """
    # Convert relative paths to absolute paths.
    i_soup = baseTool.paserUrl(i_soup, real_href)
    meta_div = i_soup.find('div', attrs={'class': 'xl_tit2_l'})
    body_div = i_soup.find('div', attrs={'class': 'xl_con1'})

    if meta_div is not None and body_div is not None:
        source_ = str(meta_div.text)
        pub_source = _fu_jian_between(source_, '来源：', '发布时间：')
        # None (not '') when the label is absent, like the other branches.
        pub_time = _fu_jian_between(source_, '发布时间：', '浏览量：') or None
        content = body_div.text
        if not content:
            log.info(f'-----{real_href}----{title}----内容为空-----')
            return None
        _fu_jian_upload_attachments(i_soup, num, pub_time, id_list)
        # body_div is returned as a Tag so that hrefs rewritten above are
        # reflected when the caller serialises it.
        return {'content': content, 'contentwithtag': body_div,
                'pub_source': pub_source, 'pub_time': pub_time,
                'pub_hao': ''}

    # Fallback layout. The original passed the class string as a tag name,
    # so this lookup could never match.
    container = i_soup.find(class_='tabs tab_base_01 rules_con1')
    content = container.text.strip() if container is not None else ''
    if not content:
        log.info(f'-----{real_href}----{title}----内容为空-----')
        return None
    hao_div = container.find('div', class_='rules_tit1 b-free-read-leaf')
    pub_hao = hao_div.text.strip() if hao_div is not None else ''
    return {'content': content, 'contentwithtag': container,
            'pub_source': '', 'pub_time': None, 'pub_hao': pub_hao}


def _fu_jian_handle_item(li_tag, num):
    """Process one <li> of the list page and report what happened to it.

    Returns one of _FU_JIAN_SKIPPED, _FU_JIAN_DUPLICATE or _FU_JIAN_SENT.
    """
    a_tag = li_tag.find('a')
    if a_tag is None or not a_tag.get('href'):
        return _FU_JIAN_SKIPPED
    # Title is whatever follows the last title=" in the <li> markup.
    title = str(li_tag).split('title="')[-1].split('">')[0].replace('\n', '')
    href_ = str(a_tag.get('href'))  # link as written on the site
    real_href, i_soup = _fu_jian_resolve(href_)

    if baseTool.db_storage.find_one({'网址': real_href}):
        return _FU_JIAN_DUPLICATE

    id_list = []
    if '.pdf' in real_href:
        # The article is a remote PDF: download it and extract the text.
        parsed = _fu_jian_parse_pdf(real_href, title, num, id_list)
    else:
        # The page fetched during resolution is reused instead of being
        # downloaded a second time.
        parsed = _fu_jian_parse_html(i_soup, real_href, title, num, id_list)
    if parsed is None:
        return _FU_JIAN_SKIPPED

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # Fields sent to Kafka.
    dic_news = {
        'attachmentIds': id_list,
        'author': '',
        'content': parsed['content'],
        'contentWithTag': str(parsed['contentwithtag']),
        'createDate': time_now,
        'deleteFlag': 0,
        'id': '',
        'labels': [{'relationId': "1673", 'relationName': "福建省国资委", 'labelMark': "policy"}],
        'origin': parsed['pub_source'],
        'organ': '',
        'topicClassification': '',
        'issuedNumber': parsed['pub_hao'],
        'publishDate': parsed['pub_time'],
        'writtenDate': None,
        'sid': '1697458829758697473',
        'sourceAddress': real_href,
        'summary': '',
        'title': title,
    }
    if not baseTool.sendKafka(dic_news):
        return _FU_JIAN_SKIPPED
    baseTool.save_data(dic_news)
    log.info(title)
    return _FU_JIAN_SENT


def fu_jian():
    """Crawl the policy list of the Fujian SASAC site and publish new items.

    Fetches the list page, then for every ``<li>`` inside an element with
    class ``borbot-line``: resolves the article URL, skips it when
    ``baseTool.db_storage`` already holds that URL, extracts the content
    (PDF or HTML, including attachments), sends the record through
    ``baseTool.sendKafka`` and stores it with ``baseTool.save_data`` on
    success. A failure on one item is logged and does not stop the crawl.
    The number of sent records and the elapsed time are logged at the end.

    Returns:
        None.
    """
    num = 0    # running index handed to baseCore.tableUpdate
    count = 0  # records sent successfully during this run
    start_time = time.time()
    url = _FU_JIAN_LIST_URL
    try:
        resp = requests.get(url=url, headers=baseTool.headers, verify=False,
                            timeout=_FU_JIAN_TIMEOUT)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')
        sections = soup.find_all(class_='borbot-line')
    except Exception as e:
        # Best-effort crawl: report the failure instead of hiding it.
        log.error(f'-----{url}----列表页获取失败----{e}-----')
        sections = []

    for section in sections:
        for li_tag in section.find_all('li'):
            try:
                outcome = _fu_jian_handle_item(li_tag, num)
            except Exception as e:
                log.error(f'-----{url}----条目处理失败----{e}-----')
                continue
            if outcome == _FU_JIAN_DUPLICATE:
                num += 1
            elif outcome == _FU_JIAN_SENT:
                num += 1
                count += 1

    end_time = time.time()
    log.info(f'共抓取{count}条数据，共耗时{end_time - start_time}')

# Script entry point: start the Fujian crawl when this file is run directly.
if __name__ == '__main__':
    fu_jian()