# -*- coding: utf-8 -*-
# @Author: MENG
# @Time  : 2022-2-25
import time
import pymongo
import datetime
import requests
import json
import re
import base64
from kafka import KafkaProducer
from requests.packages import urllib3
from gridfs import GridFS
import os
from base import BaseCore

# Project helper from the local `base` package; in the code visible here it is
# only used to obtain the shared logger.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()

# The push endpoints below are called with verify=False; silence the urllib3
# warnings that would otherwise be emitted for every such request.
urllib3.disable_warnings()

# NOTE(review): host and credentials are hard-coded in source; consider moving
# them to configuration or environment variables.
# A single MongoClient is shared by both databases: each client owns its own
# connection pool, so the former second, identical client was redundant.
client = pymongo.MongoClient(host='114.115.221.202', port=27017, username='admin', password='ZZsn@9988')
# Collection holding the news records that are filtered and pushed.
db_storage = client.中科软['数据源_0106']
db = client['ZZSN']
# GridFS bucket of the ZZSN database; the PDF files are read from it.
fs = GridFS(db)


# Push one news record
def post_data(data):
    """POST one news record as JSON to the news_info endpoint.

    Args:
        data: JSON-serialisable dict describing the record.

    Returns:
        A ``(code, msg)`` tuple taken from the response JSON. When the
        response carries no ``code``/``msg`` pair, ``(status, '')`` is
        returned so callers can always unpack two values.

    Raises:
        requests.RequestException: on transport errors.
        ValueError: when the response body is not valid JSON.
        KeyError / TypeError: when the response has neither ``code``/``msg``
            nor ``status``.
    """
    url = "https://103.83.45.34/overseasdata/news_info"
    payload = json.dumps(data)
    headers = {
        'Content-Type': 'application/json'
    }

    # NOTE(review): verify=False disables TLS certificate verification, and no
    # timeout is set, so a stalled server blocks the caller indefinitely.
    response = requests.request("POST", url, headers=headers, data=payload, verify=False)
    r = response.json()
    log.info(f'''{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}===推送数据==={r}''')
    try:
        return r['code'], r['msg']
    except (KeyError, TypeError):
        # Previously the bare status was returned here, which broke the
        # two-value unpacking done by the caller in this file.
        return r['status'], ''


# Push one image
def post_img(data):
    """POST one image payload as JSON to the save_news_pic endpoint.

    Args:
        data: JSON-serialisable dict describing the image (the caller in this
            file sends the record id, the image URL and the base64 content).

    Returns:
        ``code`` from the response JSON, or ``status`` when ``code`` is absent.

    Raises:
        requests.RequestException: on transport errors.
        ValueError: when the response body is not valid JSON.
        KeyError / TypeError: when the response has neither ``code`` nor
            ``status``.
    """
    url = "https://103.83.45.34/overseasdata/news_info/save_news_pic"
    payload = json.dumps(data)
    headers = {
        'Content-Type': 'application/json'
    }

    # NOTE(review): verify=False disables TLS certificate verification, and no
    # timeout is set on this request.
    response = requests.request("POST", url, headers=headers, data=payload, verify=False)
    r = response.json()
    log.info(f'''{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}==推送图片==={r}''')
    try:
        return r['code']
    except (KeyError, TypeError):
        # Only the lookup errors are handled; a bare except would also have
        # swallowed KeyboardInterrupt/SystemExit.
        return r['status']


# Push a PDF file
def post_filedata(data):
    """Upload the PDF belonging to a record to the upload_pdf endpoint.

    The PDF is materialised from GridFS into a temporary ``<ids>.pdf`` file in
    the current working directory, sent as a multipart upload, and removed
    again whether or not the upload succeeded.

    Args:
        data: dict providing ``pdfFilename`` (the GridFS filename) and ``ids``
            (record id; used as the query parameter and the temp-file stem).

    Returns:
        The response body as text.

    Raises:
        KeyError: when ``pdfFilename`` or ``ids`` is missing from ``data``.
        requests.RequestException: on transport errors.
        Whatever ``get_pdf_from_mongodb`` raises when the file cannot be read.
    """
    url = "https://103.83.45.34/overseasdata/news_info/upload_pdf"  # production
    payload = {}
    headers = {}
    filename = data['pdfFilename']
    ids = data['ids']
    file_path = ids + '.pdf'
    files = []
    try:
        files = get_pdf_from_mongodb(filename, ids)
        # NOTE(review): unlike post_data/post_img this request does not pass
        # verify=False although it targets the same host; confirm which
        # behaviour is intended.
        response = requests.request("POST", url + '?ids=' + ids, headers=headers, data=payload, files=files)
        return response.text
    finally:
        # Close the upload handles first: they were left open before, which
        # leaks descriptors and prevents deletion on some platforms.
        for _field, (_name, handle, _mime) in files:
            handle.close()
        # Remove the temporary file even when the upload raised.
        if os.path.exists(file_path):
            os.remove(file_path)
            log.info(f"文件 '{file_path}' 已成功删除")
        else:
            log.error(f"文件 '{file_path}' 不存在")


# Read a file from MongoDB (GridFS)
def get_pdf_from_mongodb(filename, output_path):
    """Copy a PDF from GridFS to ``<output_path>.pdf`` and open it for upload.

    Args:
        filename: GridFS filename to fetch (via ``fs.get_version``).
        output_path: path stem of the local copy; ``'.pdf'`` is appended.

    Returns:
        A one-element list ``[('file', (path, handle, 'application/pdf'))]``
        in the shape accepted by the ``files=`` argument of ``requests``.
        ``handle`` is open in binary read mode; the caller is responsible for
        closing it and for deleting the local file.

    Raises:
        Whatever ``fs.get_version`` raises when the file cannot be found, and
        OSError when the local copy cannot be written or opened.
    """
    output_path = output_path + '.pdf'
    # Fetch from GridFS before touching the disk, so a missing GridFS entry
    # does not leave an empty .pdf file behind.
    pdf_bytes = fs.get_version(filename=filename).read()
    with open(output_path, 'wb') as file:
        file.write(pdf_bytes)

    files = [
        ('file', (output_path, open(output_path, 'rb'), 'application/pdf'))
    ]
    return files


# Check records and push them
_VW_MARKER = 'volkswagengroupchina'
_VW_IMG_PREFIX = 'https://www.volkswagengroupchina.com.cn/MediaFile//Sync/'
_IMG_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}


def _strip_or_raw(doc, key):
    """Return ``doc[key]`` stripped when it supports ``strip()``, else unchanged.

    Raises KeyError when ``key`` is absent.
    """
    value = doc[key]
    try:
        return value.strip()
    except AttributeError:
        return value


def _img_srcs(html):
    """Return the ``src="..."`` value of every ``<img ...>`` tag in ``html``.

    Raises TypeError when ``html`` is not a string.
    """
    return [attrs.split('src="')[-1].split('"')[0] for attrs in re.findall('<img(.*?)>', html)]


def _vw_absolute_url(img_url):
    """Build the absolute URL for a backslash-separated volkswagengroupchina path.

    Returns None when ``img_url`` does not contain the marker or has no
    backslash-separated directory/file pair to rebuild the URL from.
    """
    if _VW_MARKER not in img_url:
        return None
    parts = img_url.split('\\')
    if len(parts) < 2:
        return None
    return _VW_IMG_PREFIX + parts[-2] + '/' + parts[-1]


def _rejection_code(columns, name, title, titleForeign, content, contentForeign,
                    richText, richTextForeign, tag1, tag2, newsTime, CREATE_DATE):
    """Return the postCode ('403' or '2') a record is rejected with, or None."""
    garbled_markers = (
        '? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?',
        '???????????????????????',
    )
    pending = '待审核，待更新'

    if title == '' or content == '' or richText == '':
        return '403'
    # Long runs of question marks in the content.
    if any(marker in content for marker in garbled_markers):
        return '403'
    if titleForeign == title:
        return '403'
    # Publication time equal to the creation time once the last two characters
    # are dropped; three sources are exempt from this check.
    if (columns != '戴姆勒中文官网' and name != '巴黎市长活动' and name != 'mylondon'
            and str(newsTime)[:-2] == str(CREATE_DATE)[:-2]):
        return '403'
    if content == '译文来源：微软自动翻译':
        return '2'
    if tag2 == '':
        return '2'
    if richTextForeign == richText or contentForeign == content:
        return '2'
    if pending in (title, richText, content, tag1, tag2):
        return '2'
    return None


def _push_images(db_dict):
    """Download every image referenced in the record's CONTENT and push it."""
    # NOTE(review): upper-case 'CONTENT' differs from the 'content' key used
    # elsewhere; confirm both fields exist in the collection.
    try:
        srcs = _img_srcs(db_dict['CONTENT'])
    except TypeError:
        return
    for img_url in srcs:
        absolute = _vw_absolute_url(str(img_url))
        if absolute is not None:
            img_url = absolute
            log.info(f'转换后的链接==={img_url}')
        try:
            img_resp = requests.get(img_url, headers=_IMG_HEADERS, timeout=10).content
        except Exception as e:
            # Best effort: an image that cannot be fetched is skipped.
            log.error(f'图片下载失败==={img_url}==={e}')
            continue
        if len(img_resp) < 1:
            continue
        img_dict = {
            # NOTE(review): key is spelled 'isd' (not 'ids'); kept as-is,
            # confirm against the receiving API.
            'isd': str(db_dict["_id"]),
            'img_href': img_url,
            'img_content': str(base64.b64encode(img_resp), encoding='utf-8')
        }
        try:
            post_img(img_dict)
        except Exception:
            log.error('图片推送error')
            continue


def _process_record(db_dict):
    """Validate one record, push it, and record the outcome in MongoDB."""
    record_filter = {'_id': db_dict['_id']}

    columns = _strip_or_raw(db_dict, 'columns')
    name = _strip_or_raw(db_dict, 'name')
    titleForeign = _strip_or_raw(db_dict, 'titleForeign')
    title = _strip_or_raw(db_dict, 'title')
    richTextForeign = _strip_or_raw(db_dict, 'richTextForeign')
    contentForeign = _strip_or_raw(db_dict, 'contentForeign')
    try:
        pdfurl = db_dict['pdfurl'].strip()
    except (KeyError, AttributeError):
        pdfurl = ''
    newsTime = db_dict['newsTime'].strip()
    CREATE_DATE = db_dict['CREATE_DATE'].strip()
    tag1 = _strip_or_raw(db_dict, 'tag1')
    if len(newsTime) == 10:
        # Date-only value: pad it to a full timestamp.
        newsTime += ' 00:00:00'

    if db_dict['postCode'] == '1':
        content = _strip_or_raw(db_dict, 'content')
        richText = _strip_or_raw(db_dict, 'richText')
        tag2 = _strip_or_raw(db_dict, 'tag2')
    else:
        # postCode '10' (the only other value the query selects): use as stored.
        content = db_dict['content']
        richText = db_dict['richText']
        tag2 = db_dict['tag2']

    verdict = _rejection_code(columns, name, title, titleForeign, content, contentForeign,
                              richText, richTextForeign, tag1, tag2, newsTime, CREATE_DATE)
    if verdict is not None:
        db_storage.update_one(record_filter, {'$set': {'postCode': verdict}})
        return

    if db_dict['name'] == '人民网':
        columns = '人民网'

    sid = db_dict['sid'].strip()
    if str(sid) == '284671':
        # For this source, rewrite the image links inside the rich text.
        try:
            srcs = _img_srcs(richText)
        except TypeError:
            return
        for img_url in srcs:
            img_tem = str(img_url)
            absolute = _vw_absolute_url(img_tem)
            if absolute is not None:
                img_tem = absolute
                log.info(f'替换内容链接==={img_tem}')
            richText = str(richText).replace(img_url, img_tem)
            db_dict['richText'] = richText

    try:
        pst_data = {
            "ids": str(db_dict["_id"]),
            "columns": columns,
            "titleForeign": clean(db_dict['titleForeign']),
            "title": clean(db_dict['title']),
            "contentForeign": clean(db_dict['contentForeign']),
            "content": clean(db_dict['content']),
            "newsTime": db_dict['newsTime'],
            "richTextForeign": clean(db_dict['richTextForeign']),
            "richText": clean(db_dict['richText'].replace("\\xF0\\x9F\\xA4\\xAD", '')),
            "tag1": db_dict['tag1'],
            "tag2": db_dict['tag2'],
            "columnUrl": db_dict['columnUrl'],
            "url": db_dict['url'],
            "name": db_dict['name'],
            "source": db_dict['source']
        }
    except Exception as e:
        log.error(e)
        return

    log.info(title)
    try:
        postCode, msg = post_data(pst_data)
    except Exception as e:
        log.error(e)
        log.error('推送接口出错！')
        time.sleep(60)
        return

    if str(postCode) == '101':
        log.info('中科软处理数据中！')
        time.sleep(60)
        return
    if str(postCode) != '0':
        log.error(f"推送失败==={db_dict['_id']}==={db_dict['columns']}==={newsTime}==={postCode}")
        return

    if '该新闻已存在，不需要二次更新' in msg:
        return

    # Upload the PDF only when the record carries a PDF link.
    if pdfurl:
        try:
            pdf_data = {
                "ids": str(db_dict["_id"]),
                "columns": columns,
                "title": clean(db_dict['title']),
                "newsTime": db_dict['newsTime'],
                "url": db_dict['url'],
                "pdfurl": db_dict['pdfurl'],
                "pdfFilename": db_dict['pdfFilename']
            }
            # NOTE(review): the upload response text replaces postCode and is
            # what gets stored below; kept as-is, confirm this is intended.
            postCode = post_filedata(pdf_data)
        except Exception as e:
            # Best effort, but no longer silent.
            log.error(f"PDF推送失败==={db_dict['_id']}==={e}")

    log.info(f"推送==={db_dict['_id']}==={db_dict['columns']}==={newsTime}==={postCode}")
    now_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    db_storage.update_one(record_filter, {'$set': {'postCode': str(postCode), 'postTime': now_time}})

    _push_images(db_dict)


def pan_dun_and_tui_song():
    """Select pending records from the last two days, filter them and push them.

    Records with postCode '1' or '10' and ``newsTime`` on or after the day
    before yesterday are loaded from ``db_storage``. Each record is either
    rejected (postCode set to '403' or '2') or pushed with ``post_data``; on
    success its PDF (if any) and its images are pushed as well and the record
    is updated with the resulting postCode and a ``postTime``.

    A record with missing or wrongly typed fields is logged and skipped so it
    cannot abort the rest of the batch. Other errors (for example database
    failures) still propagate to the caller.
    """
    date_yes = (datetime.datetime.now() + datetime.timedelta(days=-2)).strftime('%Y-%m-%d')
    # Load everything up front: the loop below updates documents and may sleep.
    db_dict_list = list(db_storage.find({'postCode': {'$in': ['1', '10']}, 'newsTime': {'$gte': date_yes}}))
    log.info(f"{date_yes}===需要推送{len(db_dict_list)}条数据")
    for db_dict in db_dict_list:
        try:
            _process_record(db_dict)
        except (KeyError, AttributeError, TypeError, IndexError) as e:
            log.error(f"数据处理出错==={db_dict.get('_id')}==={e!r}")


# Every code point outside the Basic Multilingual Plane (U+10000..U+10FFFF).
# This covers most emoji but also any other supplementary-plane character.
_NON_BMP_RE = re.compile(u'[\U00010000-\U0010ffff]')


def clean(desstr, restr=''):
    """Replace every non-BMP character (emoji and the like) in ``desstr``.

    Args:
        desstr: text to clean; must be a ``str``.
        restr: replacement for each removed character (default: remove it).

    Returns:
        The cleaned string.

    Raises:
        TypeError: when ``desstr`` is not a string.
    """
    # The surrogate-pair fallback for narrow Python 2 builds was unreachable
    # on Python 3 and has been dropped.
    return _NON_BMP_RE.sub(restr, desstr)


if __name__ == '__main__':
    # Poll forever: run one filter-and-push pass, then wait ten minutes.
    while True:
        try:
            pan_dun_and_tui_song()
        except Exception as e:
            log.error(e)
            # Back off before retrying; retrying immediately turned a
            # persistent failure into a tight loop.
            time.sleep(60)
            continue
        log.info('等待十分钟')
        time.sleep(600)
