# -*- coding: utf-8 -*-
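'''
Record how many WeChat official accounts can be collected per day; create a
database table and update each official account's status.
'''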

import requests, time, random, json, pymysql, redis
import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from obs import ObsClient
from kafka import KafkaProducer

# logging.basicConfig(filename='example.log', level=logging.INFO)

from base.BaseCore import BaseCore
import os
# Shared project helper; the attributes read below supply the logger, a DB
# connection/cursor pair and (presumably) a Redis client.
baseCore = BaseCore()
log = baseCore.getLogger()
# Connection/cursor exposed by BaseCore; used in this file for inserts into the
# WeixinGZH table.
cnx_ = baseCore.cnx
cursor_ = baseCore.cursor
# Separate connection to the clb_project database; used in this file to query the
# info_source table.
# NOTE(review): credentials are hard-coded in source — consider loading them from
# configuration or the environment.
cnx = pymysql.connect(host="114.116.44.11", user="root", password="f7s0&7qqtK", db="clb_project", charset="utf8mb4")
cursor = cnx.cursor()
# Client used for the 'WeiXinGZH:infoSourceCode' work queue (rpush calls below).
r = baseCore.r
# Silence urllib3 warnings (the search request in the main loop uses verify=False).
urllib3.disable_warnings()


def check_url(sid, article_url):
    """Tell whether *article_url* is already recorded for source *sid*.

    Looks the URL up in the Redis set ``wx_url_{sid}`` using a connection
    opened for this call.

    Returns:
        True when the URL is a member of the set, otherwise False.
    """
    client = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
    set_key = f'wx_url_{sid}'
    return client.sismember(set_key, article_url) == 1


def add_url(sid, article_url):
    """Record *article_url* in the Redis set ``wx_url_{sid}``.

    Only the URL itself is stored. The previous implementation also passed a
    literal ``3`` to SADD, which added a bogus member to every de-duplication
    set.

    Returns:
        True when the URL was already present (SADD added nothing),
        False when it was newly inserted.
    """
    client = redis.Redis(host="114.115.236.206", port=6379, password='clbzzsn')
    # SADD returns the number of members actually added; 0 means a duplicate.
    added = client.sadd(f'wx_url_{sid}', article_url)
    return added == 0


def get_proxy():
    """Load every proxy from the ``clb_proxy`` table as a requests-style dict.

    Each stored value is expected to look like ``host-port``. The database
    connection is opened for this call and is now always closed afterwards
    (it used to be leaked on every call).

    Returns:
        A list of ``{"HTTP": url, "HTTPS": url}`` dicts, one per table row.

    Raises:
        IndexError: if a stored value contains no ``-`` separator.
    """
    conn = pymysql.connect(host="114.115.159.144", user="root", password="zzsn9988", db="clb_project", charset="utf8mb4")
    try:
        with conn.cursor() as cur:
            cur.execute("select proxy from clb_proxy")
            rows = cur.fetchall()
    finally:
        conn.close()

    proxy_list = []
    for row in rows:
        # Read the column value directly instead of stripping the tuple repr.
        parts = str(row[0]).split('-')
        proxyMeta = "http://%(host)s:%(port)s" % {
            "host": parts[0],
            "port": parts[1],
        }
        # NOTE(review): requests appears to match proxy keys against the
        # lower-cased URL scheme ("http"/"https"), so these upper-case keys may
        # never be selected — confirm before relying on the proxies. The keys
        # are kept unchanged here to preserve current runtime behaviour.
        proxy_list.append({
            "HTTP": proxyMeta,
            "HTTPS": proxyMeta,
        })
    return proxy_list


def _send_to_kafka(topic, payload):
    """Publish *payload* as UTF-8 JSON to Kafka *topic*, trying up to three times.

    A producer is created per attempt and is always closed afterwards so that
    its sockets and background thread are not leaked. Each attempt waits up to
    10 seconds for the broker acknowledgement, so delivery failures are retried.

    Returns:
        True when a send was acknowledged, False when every attempt failed.
    """
    for attempt in range(1, 4):
        producer = None
        try:
            producer = KafkaProducer(bootstrap_servers=['114.115.159.144:9092'])
            future = producer.send(topic, json.dumps(payload, ensure_ascii=False).encode('utf8'))
            future.get(timeout=10)
            return True
        except Exception as exc:
            log.error(f'kafka send to {topic} failed (attempt {attempt}/3): {exc}')
            time.sleep(5)
        finally:
            if producer is not None:
                producer.close(timeout=10)
    return False


def _rehost_images(news_html, obsClient):
    """Replace every usable <img> under *news_html* with a copy hosted on OBS.

    Images that have no ``data-src``, are gifs, have a URL from which no object
    name can be derived, or cannot be downloaded are removed from the tree.
    """
    for img_one in news_html.find_all('img'):
        url_src = img_one.get('data-src')
        # A missing data-src used to raise TypeError and abort the whole account.
        if not url_src or 'gif' in url_src:
            img_one.extract()
            continue
        try:
            name_img = url_src.split('/')[-2] + '.' + url_src.split('wx_fmt=')[1]
        except IndexError:
            img_one.extract()
            continue
        try:
            res = requests.get(url_src, timeout=20)
        except requests.RequestException:
            # Skip the upload: previously execution fell through and used an
            # undefined (or the previous image's) response.
            img_one.extract()
            continue
        resp = obsClient.putContent('zzsn', name_img, content=res.content)

        url_img = resp['body']['objectUrl']
        str_url_img = f'<img src="{url_img}">'
        img_one.replace_with(BeautifulSoup(str_url_img, 'lxml').img)


def get_info(sid,json_search):
    """Download, clean and publish every article listed in *json_search*.

    For each entry of ``json_search['app_msg_list']`` the article page is
    fetched, the ``js_content`` div is stripped of style attributes, its images
    are re-hosted on OBS, and the result is sent to the ``crawlerInfo`` Kafka
    topic. After the loop a summary is sent to ``collectionAndDispatcherInfo``.

    This function depends on module-level names assigned by the ``__main__``
    loop: ``list_all_info`` (the list results are appended to), ``origin`` and
    ``info_source_code``, plus ``log``, ``cursor_`` and ``cnx_``.

    Args:
        sid: id of the information source; also selects the Redis URL set.
        json_search: decoded response of the article-list request.

    Returns:
        ``(list_all_info, num_caiji)`` — the shared result list and the number
        of articles processed in this call. If an article URL is already known
        to ``check_url`` the function returns immediately, without sending the
        summary message.
    """
    num_caiji = 0
    kaishi_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # NOTE(review): cloud credentials are hard-coded in source — consider moving
    # them to configuration. The client is also never closed; confirm whether
    # the SDK needs an explicit close.
    obsClient = ObsClient(
        access_key_id='VEHN7D0TJ9316H8AHCAV',  # Huawei Cloud access key
        secret_access_key='heR353lvSWVPNU8pe2QxDtd8GDsO5L6PGH5eUoQY',  # Huawei Cloud secret key
        server='https://obs.cn-north-1.myhuaweicloud.com'  # bucket endpoint
    )

    for one_news in json_search['app_msg_list']:
        news_title = one_news['title']
        news_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(one_news['create_time']))
        url_news = one_news['link']

        # A known URL ends collection for this account.
        if check_url(sid, url_news):
            return list_all_info,num_caiji
        try:
            res_news = requests.get(url_news, timeout=20)
        except requests.RequestException:
            continue
        soup_news = BeautifulSoup(res_news.content, 'html.parser')

        news_html = soup_news.find('div', {'id': 'js_content'})
        if news_html is None:
            # No article body on the page: record the failure and move on.
            log.info(f'--------内容为空--------{url_news}--------')
            time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            error_row = (
                news_title,
                url_news,
                news_html,
                '文章内容为空',
                time_now,
            )
            insertSql = "insert into WeixinGZH (site_name,site_url,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s)"
            cursor_.execute(insertSql, error_row)
            cnx_.commit()
            continue

        for attr_name in ('style', 'id', 'class'):
            news_html.attrs.pop(attr_name, None)
        news_content = news_html.text

        _rehost_images(news_html, obsClient)

        # Drop inline styles from every descendant tag.
        for tag in news_html.find_all(True):
            tag.attrs.pop('style', None)

        for section in news_html.find_all('section'):
            section.name = 'div'

        time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        dic_info = {
            'sid': sid,
            'title': news_title,
            'content': news_content,
            'contentWithtag': str(news_html),
            'summary': '',
            'author': '',
            'origin': origin,
            'publishDate': news_date,
            'sourceAddress': url_news,
            'source': '11',
            'createDate': time_now
        }
        # NOTE(review): the URL is not recorded with add_url() after a successful
        # send (that call was commented out originally), so check_url() only
        # matches URLs stored by some other process — confirm this is intended.
        # As before, the article is counted even when the Kafka send fails;
        # the failure is logged by the helper.
        _send_to_kafka("crawlerInfo", dic_info)
        num_caiji = num_caiji + 1
        list_all_info.append(dic_info)

    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    dic_info2 = {
        'infoSourceId': sid,
        'code': info_source_code,
        'num': num_caiji,
        'collectTime': kaishi_time,
        'dispatcherTime': time_now,
        'dispatcherStatus': '1',
        'source': '1',
    }
    _send_to_kafka("collectionAndDispatcherInfo", dic_info2)
    return list_all_info,num_caiji

# Scheduled refill of the work queue.
def getFromSql():
    """Push the code of every WeChat information source onto the Redis queue.

    Selects ``info_source_code`` for all ``info_source`` rows whose ``site_uri``
    contains ``mp.weixin.qq.com`` and appends each code to the Redis list
    ``WeiXinGZH:infoSourceCode``.
    """
    cursor.execute("SELECT info_source_code from info_source where site_uri like '%mp.weixin.qq.com%'")
    codes = []
    for record in cursor.fetchall():
        codes.append(record[0])

    # Queue every code in Redis.
    queue_key = 'WeiXinGZH:infoSourceCode'
    for code in codes:
        r.rpush(queue_key, code)

# Refresh the browser and read the current token and cookies.
def flushAndGetToken(list_b):
    """Refresh the first browser in *list_b* and return its token and cookies.

    Args:
        list_b: list whose first element is the Selenium browser to refresh.

    Returns:
        ``(token, cookies)`` where *token* is the value of the ``token`` query
        parameter of the browser's current URL and *cookies* maps cookie names
        to values.

    Raises:
        IndexError: if the current URL contains no ``token=`` parameter.
    """
    browser_run = list_b[0]
    log.info('======刷新浏览器=====')
    browser_run.refresh()
    cookie_list = browser_run.get_cookies()
    cur_url = browser_run.current_url
    # Cut at the next '&' so that parameters following the token are not
    # carried along when the token is not the last query parameter.
    token = cur_url.split('token=')[1].split('&')[0]
    log.info(f'===========当前token为：{token}============')
    cookies = {cookie['name']: cookie['value'] for cookie in cookie_list}
    return token,cookies

# Official accounts whose collection failed are put back into Redis.
def rePutIntoR(item):
    """Append *item* to the Redis list ``WeiXinGZH:infoSourceCode``."""
    queue_key = 'WeiXinGZH:infoSourceCode'
    r.rpush(queue_key, item)

def _record_error(site_name, site_url, source_code, detail, error_type):
    """Insert one failure row into the WeixinGZH table and commit it."""
    insertSql = "insert into WeixinGZH (site_name,site_url,info_source_code,json_error_info,error_type,create_time) values (%s,%s,%s,%s,%s,%s)"
    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    cursor_.execute(insertSql, (site_name, site_url, source_code, detail, error_type, time_now))
    cnx_.commit()


# The script body deliberately stays at module level: get_info() reads
# list_all_info, origin and info_source_code as module globals.
if __name__=="__main__":

    time_start = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(f'开始时间为：{time_start}')

    # NOTE(review): this assignment probably has no effect on sessions created
    # later, because the adapter default is bound when requests is imported —
    # verify, and mount an HTTPAdapter(max_retries=3) if retries are wanted.
    requests.adapters.DEFAULT_RETRIES = 3
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    }

    opt = webdriver.ChromeOptions()
    opt.add_argument(
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')

    opt.add_argument("--ignore-certificate-errors")
    opt.add_argument("--ignore-ssl-errors")
    # Both switches must be passed in one call: a second call with the same
    # option name replaces the first value instead of extending it.
    opt.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
    opt.add_experimental_option('useAutomationExtension', False)
    chromedriver = r'D:/chrome/chromedriver.exe'
    browser1 = webdriver.Chrome(chrome_options=opt, executable_path=chromedriver)

    list_b = [browser1]
    url = "https://mp.weixin.qq.com/"
    browser1.get(url)

    # Adjustable pause after the page is opened, before the first refresh
    # (presumably time for a manual login — confirm).
    time.sleep(30)

    # TODO: read the sources from the database and push them to Redis as a
    # daily scheduled job (see getFromSql()).

    s = requests.session()
    # Number of official accounts processed so far.
    count = 0
    try:
        while True:
            # Refresh the browser and read the current token and cookies.
            token, cookies = flushAndGetToken(list_b)
            list_all_info = []
            log.info('===========获取公众号============')
            start_ = time.time()
            # Pop one source code from the Redis queue.
            infoSourceCode = baseCore.redicPullData('WeiXinGZH:infoSourceCode')
            if infoSourceCode == 'None':
                # Queue drained: refill it from the database and wait for the refill.
                getFromSql()
                time.sleep(20)
                log.info(f'========本次公众号已采集完毕，共采集{count}个公众号=========总耗时：{baseCore.getTimeCost(start_,time.time())}')
                continue

            # Parameterized to avoid building SQL from queue content.
            sql = "SELECT site_uri,id,site_name,info_source_code from info_source where info_source_code = %s"
            cursor.execute(sql, (infoSourceCode,))
            row = cursor.fetchone()
            if row is None:
                # Unknown code: skip it instead of crashing on row[0].
                log.error(f'info_source_code {infoSourceCode} not found in info_source, skipped')
                time.sleep(2)
                continue

            dic_url = {
                'url_': row[0],
                'sid': row[1],
                'name': row[2],
                'info_source_code': row[3],
                'biz': ''
            }

            log.info('===========获取biz==========')
            s.cookies.update(cookies)
            url_ = dic_url['url_']
            origin = dic_url['name']
            info_source_code = dic_url['info_source_code']
            sid = dic_url['sid']
            try:
                biz = url_.split('__biz=')[1].split('==&')[0].split('=')[0]
                dic_url['biz'] = biz
            except Exception as e:
                log.info(f'---公众号--{origin}---biz错误')
                _record_error(origin, url_, info_source_code, str(e), 'biz错误')
                continue
            fakeid = biz + '=='

            # NOTE(review): begin=5 looks like it skips the five newest
            # articles — confirm this offset is intended.
            url_search = f'https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=5&count=5&fakeid={fakeid}&type=9&query=&token={token}&lang=zh_CN&f=json&ajax=1'
            try:
                # Choose among all stored proxies; the previous fixed index
                # range 0..3 failed whenever fewer than four proxies existed.
                ip = random.choice(get_proxy())
                json_search = s.get(url_search, headers=headers, proxies=ip,
                                    verify=False, timeout=20).json()
                str_t = json.dumps(json_search)
                time.sleep(2)
            except Exception as exc:
                log.error(f'===公众号{origin}请求失败！当前时间：{baseCore.getNowTime(1)}===')
                log.error(f'request error detail: {exc}')
                rePutIntoR(info_source_code)
                continue

            # Known values of base_resp.ret:
            #   0       ok
            #   200013  'freq control'    -> the login account is rate-limited/blocked
            #   200002  'invalid args'    -> wrong biz / link of the official account
            #   200003  'invalid session'
            ret = json_search['base_resp']['ret']
            if ret == 200013:
                # Put the source back on the queue, then wait one hour in
                # ten-minute steps, refreshing the browser after each step.
                rePutIntoR(info_source_code)
                log.info(f'======该账号被封=======')
                for i in range(0,6):
                    time.sleep(600)
                    wait_time = (i + 1) * 600  # cumulative seconds waited so far
                    log.info(f'=======等待时间{wait_time}秒=====刷新浏览器=====')
                    browser_run = list_b[0]
                    browser_run.refresh()
                continue
            elif ret == 200002:
                # Wrong link of the official account: store the response and error type.
                _record_error(origin, url_, info_source_code, str_t, '无效biz参数')
                log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_,time.time())}')
                continue
            elif ret == 200003:
                # Invalid session.
                _record_error(origin, url_, info_source_code, str_t, '无效session')
                log.info(f'公众号----{origin}----耗时{baseCore.getTimeCost(start_, time.time())}')
                continue
            elif ret != 0:
                log.info(f'----其他情况-----{json_search}---公众号{origin}------')
                _record_error(origin, url_, info_source_code, str_t, '其他错误')
                continue

            try:
                list_all_info,num_caiji= get_info(sid,json_search)
                time.sleep(2)
                if len(list_all_info) != 0:
                    count += 1
                    time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                    # NOTE(review): num_caiji is bound to success_info and
                    # '采集成功' to success_num, which looks swapped relative to
                    # the column names — confirm against the table schema.
                    success = [
                        origin,
                        url_,
                        info_source_code,
                        num_caiji,
                        '采集成功',
                        time_now
                    ]
                    # Store the success record.
                    insertSql = "insert into WeixinGZH (site_name,site_url,info_source_code,success_info,success_num,create_time) values (%s,%s,%s,%s,%s,%s)"
                    cursor_.execute(insertSql, tuple(success))
                    cnx_.commit()
                    # All articles of this official account have been processed.
                    log.info(f'{fakeid}、公众号{origin}:采集成功！、已采集{count}个公众号、耗时{baseCore.getTimeCost(start_,time.time())}')
                else:
                    log.info(f'{fakeid}、公众号{origin}、网址已存在！耗时{baseCore.getTimeCost(start_,time.time())}')
            except Exception as e:
                # The list response was valid but collecting the articles failed.
                count += 1
                _record_error(origin, url_, info_source_code, str(e), '采集失败')
                log.info(f'{fakeid}、公众号：{origin}采集失败！！！！！！耗时{baseCore.getTimeCost(start_, time.time())}')

            time.sleep(2)
    finally:
        # Release resources. This was unreachable after the endless loop; it
        # now runs when the loop is left through an exception or Ctrl+C.
        cursor.close()
        cnx.close()
        baseCore.close()


