import datetime
import io
import json
import os
import random
import re
import time
import uuid

import pandas as pd
import pymysql
import redis
import requests
from bs4 import BeautifulSoup
from minio import Minio
from retry import retry
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from base import BaseCore

# Shared project helper; it exposes the logger, a cursor and a connection used below.
baseCore = BaseCore.BaseCore()
log = baseCore.getLogger()
cursor_ = baseCore.cursor
cnx_ = baseCore.cnx
# Dedicated MySQL connection handed to tableUpdate() when attachment rows are written.
# NOTE(review): the credentials here and in the MinIO client below are hard-coded —
# consider loading them from configuration or the environment.
cnx = pymysql.connect(host='114.115.159.144', user='caiji', password='zzsn9988', db='clb_project', charset='utf8mb4')
# MinIO client that receives the downloaded images and videos.
client = Minio('114.115.215.96:9089', access_key='zzsn@9988@!', secret_key='zzsn@9988@!0519', secure=False)
# Value written to the create_by column of every attachment row created by this script.
create_by = 'LiuLiYuan'


def tableUpdate(year, com_name, type_id, item_id, group_name, path, full_path, category, file_size, order_by, status,
                create_by, create_time, come, page_size, cnx):
    """Insert one attachment row and return the id looked up by its full path.

    The row is written to ``clb_sys_attachment`` (``com_name`` goes to the
    ``name`` column, ``come`` to the ``source`` column), the transaction is
    committed, and the id is then read back by selecting on ``full_path``.

    Args:
        cnx: open database connection offering ``cursor()`` and ``commit()``.
        (all other arguments are stored verbatim in the matching columns)

    Returns:
        The first column of the first row matching ``full_path``.
    """
    insert_sql = '''insert into clb_sys_attachment(year,name,type_id,item_id,group_name,path,full_path,category,file_size,order_by,status,create_by,create_time,source,page_size) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    lookup_sql = '''select id from clb_sys_attachment where full_path = %s'''
    row_values = (
        year, com_name, type_id, item_id, group_name,
        path, full_path, category, file_size, order_by,
        status, create_by, create_time, come, page_size,
    )
    with cnx.cursor() as cur:
        # Write and commit first so the follow-up query can see the new row.
        cur.execute(insert_sql, row_values)
        cnx.commit()
        cur.execute(lookup_sql, full_path)
        attachment_id = cur.fetchone()[0]
    return attachment_id


def add_check_id(uid, mid):
    """Record a post id in the per-account Redis set and report duplicates.

    Args:
        uid: account id; the set is stored under the key ``weibo:{uid}``.
        mid: post id to add to that set.

    Returns:
        True when ``mid`` was already in the set (a duplicate), False when it
        was newly added by this call.
    """
    r = redis.Redis(host="114.116.90.53", port=6379, password='zzsn9988', db=3)
    # SADD treats every extra positional argument as another member, so only
    # the post id is passed. It returns how many members were actually added:
    # 0 means ``mid`` was already present.
    added = r.sadd(f'weibo:{uid}', mid)
    return added == 0


def get_cookie(driver):
    """Open weibo.com, wait for a completed login and return the cookies.

    The page is reloaded in a fresh browser until the ``#app`` element becomes
    clickable. The login button is then clicked and the function polls every
    5 seconds until a ``SSOLoginState`` cookie shows up.

    Args:
        driver: Selenium WebDriver to start with. It may be quit and replaced
            by a newly built one if the page fails to load.

    Returns:
        ``(cookies, driver)`` — a ``{name: value}`` dict usable with requests,
        and the driver that is actually logged in (callers must keep using
        this one).
    """
    path = 'F:/spider/117/chromedriver-win64/chromedriver.exe'
    driver.get("https://weibo.com/")
    while True:
        try:
            WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//*[@id=\"app\"]")))
            break
        except Exception as e:
            # Only ordinary errors trigger a browser restart; KeyboardInterrupt
            # and SystemExit propagate so the loop can still be stopped.
            log.error(f'微博首页加载失败，重启浏览器重试==={e}')
            driver.quit()
            driver = baseCore.buildDriver(path, False)
            driver.get("https://weibo.com/")
    driver.find_element(By.CLASS_NAME, 'LoginCard_btn_Jp_u1').click()
    # Poll until the login cookie appears.
    while not any(cookie['name'] == 'SSOLoginState' for cookie in driver.get_cookies()):
        time.sleep(5)
    # Flatten Selenium's cookie records into the name -> value form requests expects.
    cookies = {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    return cookies, driver


def get_cookie_again(driver):
    """Refresh the current page and return the browser cookies as ``{name: value}``.

    When several cookie records share a name, the last one wins.
    """
    driver.refresh()
    return {entry['name']: entry['value'] for entry in driver.get_cookies()}


@retry(tries=5, delay=5)
def getRes(session, url_one_con, headers):
    """GET ``url_one_con`` and return the response, retrying on failure.

    The ``retry`` decorator is configured with ``tries=5, delay=5``.

    Args:
        session: requests session used for the call.
        url_one_con: URL of the page to fetch.
        headers: request headers.

    Returns:
        The response object, guaranteed to have status code 200.

    Raises:
        RuntimeError: the status code was not 200 (this is what triggers a retry).
    """
    res_one_con = session.get(url_one_con, headers=headers)
    if res_one_con.status_code != 200:
        # Raised explicitly; a bare ``raise`` here yields the same exception
        # type but with no information about what went wrong.
        raise RuntimeError(f'unexpected HTTP status {res_one_con.status_code} for {url_one_con}')
    return res_one_con


def getTotal(session, uid, headers):
    """Return how many timeline pages the account has.

    Reads ``data.total`` from the first page of the account's post feed and
    divides it by 20, rounding up.
    """
    first_page_url = f"https://weibo.com/ajax/statuses/mymblog?uid={uid}&page=1&feature=0"
    payload = session.get(first_page_url, headers=headers).json()
    full_pages, leftover = divmod(payload['data']['total'], 20)
    # A partially filled last page still counts as a page.
    return full_pages + 1 if leftover else full_pages


def trs_date(date_str):
    """Reformat a timestamp like ``'Tue Aug 01 12:30:45 +0800 2023'``.

    The input is parsed with the pattern ``%a %b %d %H:%M:%S %z %Y`` and
    returned as ``'YYYY-MM-DD HH:MM:SS'``. The UTC offset is parsed but not
    applied: the clock time is kept exactly as written.
    """
    parsed = datetime.datetime.strptime(date_str, '%a %b %d %H:%M:%S %z %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')


def getLongContent(session, mblogid):
    """Fetch the full text of a truncated post and decorate it with links.

    Each ``#topic#`` marker is wrapped in an anchor pointing at the topic
    search page. Each short URL is removed from the text and a titled anchor
    to the long URL is appended at the end instead.

    Args:
        session: requests session used for the call.
        mblogid: id of the post, passed to the ``longtext`` endpoint.

    Returns:
        The expanded text with topic and link anchors as an HTML string.

    Raises:
        KeyError: the response has no ``data`` / ``longTextContent`` entry.
    """
    mblogurl = f'https://weibo.com/ajax/statuses/longtext?id={mblogid}'
    mblogreq = session.get(mblogurl)
    mblogreq.encoding = mblogreq.apparent_encoding
    mblogreqdata = mblogreq.json()['data']
    content = mblogreqdata['longTextContent']

    # The struct lists are treated as optional: a post without topics or
    # links yields its plain long text.
    wrapped_titles = set()
    for topic_struct in mblogreqdata.get('topic_struct') or []:
        topic_title = topic_struct['topic_title']
        if topic_title in wrapped_titles:
            # The anchor itself contains "#title#"; wrapping twice would nest anchors.
            continue
        wrapped_titles.add(topic_title)
        topic_content = f'''<a href="//s.weibo.com/weibo?q=#{topic_title}#" target="_blank">#{topic_title}#</a>'''
        # str.replace returns a new string; the result must be assigned back.
        content = content.replace(f'#{topic_title}#', topic_content)

    link_anchors = []
    for url_struct in mblogreqdata.get('url_struct') or []:
        short_url = url_struct['short_url']
        if short_url:
            content = content.replace(short_url, '')
        title = url_struct['url_title']
        long_url = url_struct['long_url']
        link_anchors.append(
            f'''<a target="_blank" href="{long_url}"><img class="icon-link" src="https://h5.sinaimg.cn/upload/2015/09/25/3/timeline_card_small_web_default.png"/>{title}</a>''')
    # Anchors are appended only after all short URLs were stripped, so a later
    # replacement can never alter an anchor that was already generated.
    return content + ''.join(link_anchors)


def getPic(session, pic_infos, mid, uid, origin, year):
    """Download the pictures of a post, upload them to MinIO and register them.

    Args:
        session: requests session used to fetch the image bytes.
        pic_infos: picture descriptors, each exposing ``['large']['url']``.
            Either an iterable of descriptors or a mapping whose values are
            descriptors.
        mid: post id; part of the object path and stored as the item id.
        uid: account id; only used in the failure log line.
        origin: stored in the attachment's source column.
        year: stored in the attachment's year column.

    Returns:
        ``(pic_list, ok)`` — the attachment ids created so far, and False as
        soon as one picture fails to download, upload or register (processing
        stops at the first failure).
    """
    # NOTE(review): the feed appears to deliver pic_infos as a mapping keyed by
    # picture id — TODO confirm. Iterating such a mapping directly would yield
    # the keys, so use its values.
    if isinstance(pic_infos, dict):
        pic_infos = pic_infos.values()
    pic_list = []
    for pic_info in pic_infos:
        pic_url = pic_info['large']['url']
        category = os.path.splitext(pic_url)[1]
        img_name = f'{uuid.uuid1()}{category}'
        suffix = category.lower()
        if 'jpg' in suffix or 'jpeg' in suffix:
            content_type = 'image/jpeg'
        elif 'png' in suffix:
            content_type = 'image/png'
        elif 'gif' in suffix:
            content_type = 'image/gif'
        else:
            # Unknown extension: use a generic type instead of leaving the
            # variable unbound or reusing the previous picture's type.
            content_type = 'application/octet-stream'
        group_name = 'jcxm'
        path = f'img/微博/{mid}/{img_name}'
        try:
            req = session.get(pic_url)
            res_content = io.BytesIO(req.content)
            size = res_content.getbuffer().nbytes
            # Stream the bytes straight into object storage.
            client.put_object(group_name, path, res_content, size, content_type=content_type)
            create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            img_id = tableUpdate(year, img_name, 15, mid, group_name, path, path,
                                 category, size, 1, 1, create_by, create_time, origin, 0, cnx)
            pic_list.append(img_id)
        except Exception as e:
            log.error(f"{mid}...{uid}下载失败==={e}")
            return pic_list, False
    return pic_list, True


def getPage(session, page_info, long_url, mid, uid, origin, year):
    """Store the media attached to a post's link card (video or cover image).

    For ``object_type == 'video'`` the video is stored via ``getVedio`` and
    the link becomes ``page_info['media_info']['h5_url']``. For every other
    type the card's ``page_pic`` image is downloaded, uploaded to MinIO and
    registered as an attachment.

    Args:
        session: requests session used for downloads.
        page_info: link-card descriptor of the post.
        long_url: target URL of the card (replaced for videos, see above).
        mid: post id; part of the object path and stored as the item id.
        uid: account id; only used in the failure log line.
        origin: stored in the attachment's source column.
        year: stored in the attachment's year column.

    Returns:
        ``(page_list, ok)`` — ``page_list`` holds one
        ``[attachment_id, long_url, object_type]`` entry on success and is
        empty on failure.
    """
    page_list = []
    object_type = page_info['object_type']
    if object_type == 'video':
        long_url = page_info['media_info']['h5_url']
        video_id, flg = getVedio(session, page_info, mid, year, origin)
        if not flg:
            return page_list, False
        page_list.append([video_id, long_url, object_type])
        return page_list, True

    url = page_info['page_pic']
    category = os.path.splitext(url)[1]
    img_name = f'{uuid.uuid1()}{category}'
    suffix = category.lower()
    if 'jpg' in suffix or 'jpeg' in suffix:
        content_type = 'image/jpeg'
    elif 'png' in suffix:
        content_type = 'image/png'
    elif 'gif' in suffix:
        content_type = 'image/gif'
    else:
        # Unknown extension: use a generic type instead of leaving the variable unbound.
        content_type = 'application/octet-stream'
    group_name = 'jcxm'
    path = f'img/微博/{mid}/{img_name}'
    try:
        req = session.get(url)
        res_content = io.BytesIO(req.content)
        size = res_content.getbuffer().nbytes
        # Stream the bytes straight into object storage.
        client.put_object(group_name, path, res_content, size, content_type=content_type)
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        img_id = tableUpdate(year, img_name, 15, mid, group_name, path, path,
                             category, size, 1, 1, create_by, create_time, origin, 0, cnx)
        page_list.append([img_id, long_url, object_type])
    except Exception as e:
        log.error(f"{mid}...{uid}下载失败==={e}")
        return page_list, False
    return page_list, True


def getForward(session, mid, uid, origin, year, one_con):
    """Process the original post embedded in a repost and store its media.

    Args:
        session: requests session used for downloads.
        mid: id of the reposting post; downloaded media is filed under it.
        uid: account id, passed on for logging.
        origin: stored in the attachment's source column.
        year: stored in the attachment's year column.
        one_con: the reposted (original) post as returned by the feed.

    Returns:
        False when a picture or link-card download reported failure,
        True otherwise.
    """
    # NOTE(review): the text, picture ids and link-card entries assembled here
    # are not returned yet; only the success flag is.
    content = one_con['text']
    if '>展开<' in content:
        content = getLongContent(session, one_con['mblogid'])
    else:
        contentTag = BeautifulSoup(content, 'lxml')
        for a in contentTag.find_all('a'):
            href = a.get('href')
            if not href or href.startswith(('http://', 'https://')):
                continue
            # Rewrite the attribute on the parsed tag so the serialized HTML
            # below actually carries the absolute link.
            if '@' in a.text:
                a['href'] = 'https://weibo.com' + href
            elif '#' in a.text:
                a['href'] = 'https:' + href
        content = str(contentTag)

    # Pictures have to be downloaded and stored.
    pic_infos = one_con.get('pic_infos')
    if pic_infos:
        try:
            pic_list, flg = getPic(session, pic_infos, mid, uid, origin, year)
        except Exception as e:
            # Best effort: an unexpected payload must not abort the repost.
            log.error(f'{uid}==={mid}===转发图片处理异常==={e}')
        else:
            if not flg:
                return False

    # The link card is only processed when a target URL is available.
    page_info = one_con.get('page_info')
    url_structs = one_con.get('url_struct')
    if page_info and url_structs:
        try:
            long_url = url_structs[0]['long_url']
            page_list, flg = getPage(session, page_info, long_url, mid, uid, origin, year)
        except Exception as e:
            log.error(f'{uid}==={mid}===转发跳转内容处理异常==={e}')
        else:
            if not flg:
                return False
    return True


def getVedio(session, page_info, mid, year, origin):
    """Download a post's video, upload it to MinIO and register it.

    The download URL is the first non-empty value among ``mp4_720p_mp4``,
    ``mp4_hd_url`` and ``mp4_sd_url``.

    Args:
        session: requests session used for the download.
        page_info: link-card descriptor of the video post.
        mid: post id; part of the object path and stored as the item id.
        year: stored in the attachment's year column.
        origin: stored in the attachment's source column.

    Returns:
        ``(video_id, True)`` on success, ``('', False)`` when the title or
        URL is missing or the download, upload or database insert fails.
    """
    # NOTE(review): the video fields may live under page_info['media_info']
    # rather than on page_info itself — TODO confirm against a real payload.
    # Both places are consulted, page_info first.
    media_info = page_info.get('media_info') or {}

    def _field(key):
        return page_info.get(key) or media_info.get(key)

    title = _field('kol_title')
    if not title:
        log.error(f'{mid}===视频标题获取失败')
        return '', False
    # Prefer the best quality; a missing key simply falls through to the next one.
    href = next((link for link in map(_field, ('mp4_720p_mp4', 'mp4_hd_url', 'mp4_sd_url')) if link), None)
    if not href:
        log.error(f'{mid}==={title}===视频链接获取失败')
        return '', False

    group_name = 'jcxm'
    # One path for both the upload and the database row, so the stored record
    # always points at the object that was actually written.
    path = f'video/微博/{mid}/{title}.mp4'
    try:
        req = session.get(href)
        res_content = io.BytesIO(req.content)
        size = res_content.getbuffer().nbytes
        # Stream the bytes straight into object storage.
        client.put_object(group_name, path, res_content, size, content_type='video/mp4')
    except Exception as e:
        log.error(f'{mid}==={title}===上传minio失败==={e}')
        return '', False

    create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    try:
        video_id = tableUpdate(year, title, 16, mid, group_name, path, path,
                               'mp4', size, 1, 1, create_by, create_time, origin, 0, cnx)
    except Exception as e:
        log.error(f'{mid}==={title}===上传附件表失败==={e}')
        return '', False
    return video_id, True


def _profile_info_url(url):
    """Build the profile-info endpoint URL that resolves a profile page URL to a uid."""
    if url.endswith('/'):
        url = url[:-1]
    slug = url.split('/')[-1].split('?')[0]
    # URLs carrying both a query string and percent-escapes are looked up by
    # screen name; everything else is looked up by custom domain.
    key = 'screen_name' if ('?' in url and '%' in url) else 'custom'
    return f"https://weibo.com/ajax/profile/info?{key}={slug}"


def _absolutize_links(soup):
    """Rewrite mention and topic anchors of *soup* to absolute URLs, in place."""
    for a in soup.find_all('a'):
        href = a.get('href')
        if not href or href.startswith(('http://', 'https://')):
            continue
        if '@' in a.text:
            a['href'] = 'https://weibo.com' + href
        elif '#' in a.text:
            a['href'] = 'https:' + href


def _collect_post(session, one_con, uid, origin):
    """Collect one post of the feed.

    Returns False when the post was published before the cutoff date (the
    caller should stop crawling), True in every other case, including posts
    that were skipped.
    """
    equ_source = one_con['source'].replace('\n', '')  # how the post was published
    like = int(one_con['attitudes_count'])
    commentNum = int(one_con['comments_count'])
    collection = int(one_con['reposts_count'])
    mid = one_con['mid']
    if add_check_id(uid, mid):
        log.info(f'{uid}==={mid}===已采集')
        return True
    publishDate = trs_date(one_con['created_at'])
    year = publishDate[:4]
    # Both sides are ISO-formatted, so string comparison orders them correctly.
    if publishDate < '2023-08-01':
        return False

    content = one_con['text']
    if '>展开<' in content:
        content = getLongContent(session, one_con['mblogid'])
        contentWithTag = content
    else:
        soup = BeautifulSoup(content, 'lxml')
        _absolutize_links(soup)
        content = soup.text
        contentWithTag = str(soup)
    title_match = re.search('【(.*?)】', content)
    title = title_match.group(1) if title_match else content
    # NOTE(review): title and contentWithTag are prepared here but nothing
    # downstream consumes them yet.

    # Pictures have to be downloaded and stored.
    pic_list = []
    pic_infos = one_con.get('pic_infos')
    if pic_infos:
        try:
            pic_list, flg = getPic(session, pic_infos, mid, uid, origin, year)
        except Exception as e:
            # Best effort: an unexpected payload must not abort the crawl.
            log.error(f'{uid}==={mid}===图片处理异常==={e}')
        else:
            if not flg:
                return True

    # The link card is only processed when a target URL is available.
    page_list = []
    page_info = one_con.get('page_info')
    url_structs = one_con.get('url_struct')
    if page_info and url_structs:
        try:
            long_url = url_structs[0]['long_url']
            page_list, flg = getPage(session, page_info, long_url, mid, uid, origin, year)
        except Exception as e:
            log.error(f'{uid}==={mid}===跳转内容处理异常==={e}')
        else:
            if not flg:
                return True

    retweeted_status = one_con.get('retweeted_status')
    if retweeted_status:
        try:
            forward_ok = getForward(session, mid, uid, origin, year, retweeted_status)
            # getForward returns a bare flag; also accept a (forward, flag)
            # pair in case it is extended to return the repost data.
            if isinstance(forward_ok, tuple):
                forward_ok = forward_ok[-1]
        except Exception as e:
            log.error(f'{uid}==={mid}===转发内容处理异常==={e}')
        else:
            if not forward_ok:
                return True

    log.info(
        f'{uid}==={mid}==={equ_source}==={like}==={commentNum}==={collection}==={publishDate}==={content}==={pic_list}==={page_list}')
    # Random pause between posts.
    time.sleep(random.uniform(3, 5))
    return True


def get_content_by_user_uid(url, sid):
    """Crawl the posts of one Weibo account, newest first.

    Logs in through a browser, resolves ``url`` to the account's uid, then
    walks the post feed page by page until a post older than the cutoff date
    is reached. Browser cookies are re-read every 30 minutes.

    Args:
        url: profile page URL of the account.
        sid: accepted for interface compatibility; not used by this function.

    Returns:
        None. Returns early (after logging) when the uid cannot be resolved.
    """
    path = 'F:/spider/117/chromedriver-win64/chromedriver.exe'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    }
    driver = baseCore.buildDriver(path, False)
    try:
        session = requests.session()
        # get_cookie may replace the browser, so keep the driver it hands back.
        cookies, driver = get_cookie(driver)
        # get_cookie already returns a plain {name: value} dict.
        session.cookies.update(cookies)

        url_get_uid = _profile_info_url(url)
        try:
            res_get_uid_json = session.get(url_get_uid, headers=headers).json()
            weibo_name = res_get_uid_json['data']['user']['screen_name']  # account display name
            uid = res_get_uid_json['data']['user']['id']  # account uid
        except Exception as e:
            log.error(f"{url}:uid获取失败==={e}")
            return
        origin = "微博-" + weibo_name

        num_page = getTotal(session, uid, headers)
        log.info(f'开始采集共{num_page}页')
        start_time = time.time()
        # Pages are 1-based and num_page is a page count, so the last page is num_page itself.
        for page in range(1, num_page + 1):
            log.info(f'开始采集第{page}页')
            try:
                url_all_con = f"https://weibo.com/ajax/statuses/mymblog?uid={uid}&page={page}&feature=0"
                list_all_con = session.get(url_all_con, headers=headers).json()['data']['list']
            except Exception as e:
                log.error("{}的{}页获取失败==={}".format(weibo_name, page, e))
                continue

            reached_cutoff = False
            for one_con in list_all_con:
                if not _collect_post(session, one_con, uid, origin):
                    reached_cutoff = True
                    break
            log.info("{}的{}页获取成功".format(weibo_name, page))

            if time.time() - start_time > 1800:
                session.cookies.update(get_cookie_again(driver))
                start_time = time.time()
                log.info('已重新获取cookie')
            if reached_cutoff:
                break
    finally:
        # The browser was started by this function, so it is closed here too.
        try:
            driver.quit()
        except Exception as e:
            log.error(f'浏览器关闭失败==={e}')


if __name__ == "__main__":
    # Example invocation (currently disabled): crawl one account by its profile URL.
    # get_content_by_user_uid('https://weibo.com/u/1689572847', '1571698920447193090')
    # Shut down the shared BaseCore helper (presumably releases its connections — confirm in BaseCore).
    baseCore.close()
