"""
Author: Tao Zhang
Desc: 基于字典的情感极性分析，用于年报情感分析。
2021-10-29: 实现分行业的数字化转型正负面趋势分析。
2021-11-05: 实现echart折线分析图的嵌入，API开发完成。
"""
import cx_Oracle
import pandas as pd
import os
import re
from tqdm import tqdm
import jieba
import json
import zipfile
from flask import Flask, request, make_response
from urllib.parse import quote
import io
from platform_zzsn.settings import BASE_DIR

# Load the sentiment lexicons once, at import time.
print('开始加载情感词典 ...')
reverse_words = ['车道偏离']  # masked phrases: removed from a sentence before it is scored

negdict = []  # negative sentiment words
posdict = []  # positive sentiment words
nodict = []  # negation words
plusdict = []  # degree adverbs
sentiment_base_dir = os.path.join(BASE_DIR, 'static/base/sentiment_dict')

# (file relative to sentiment_base_dir, list receiving the file's first column), in load order.
# The positive list comes from the financial dictionary; the general-purpose
# '情感极性词典/正面情绪词.txt' list is not loaded.
for _lexicon_file, _word_list in (
        ('中文金融词典/dict/formal_neg.txt', negdict),
        ('中文金融词典/dict/formal_pos.txt', posdict),
        ('情感极性词典/否定词.txt', nodict),
        ('情感极性词典/程度副词.txt', plusdict),
):
    sl = pd.read_csv(os.path.join(sentiment_base_dir, _lexicon_file), header=None, encoding='utf-8')
    _column = sl[0]
    _word_list.extend(_column[row] for row in range(len(_column)))

print('情感词典加载完成！')

# Add domain terms to jieba's dictionary so the tokenizer can emit them as words.
for w in ('非公开', '非流动', '车联网', '网联化', '智能网联化', '智能网联', '新能源', '共享化'):
    jieba.add_word(w)


def clean_blank_lines(text):
    """
    Remove tabs and carriage returns, then collapse runs of newlines into one.

    :param text: input text (str)
    :return: the text without '\\t' / '\\r' and without consecutive newlines
    """
    # Drop every tab and carriage return in a single pass.
    without_tabs_and_cr = text.translate({ord('\t'): None, ord('\r'): None})
    # Any run of two or more newlines becomes exactly one.
    return re.sub(r'\n{2,}', '\n', without_tabs_and_cr)


def repaire_table_of_content(content):
    """
    Insert a space between a section marker ("第X节") and the heading glued to it.

    eg:
    第一节重要提示、 -> 第一节 重要提示、
    第四节九、公司未 -> 第四节 九、公司未
    第二节公司简介和 -> 第二节 公司简介和
    第三节公司业务概 -> 第三节 公司业务概

    A marker that is already followed by whitespace (or by nothing) is left
    untouched, so running the function again on repaired text adds no spaces.

    :param content: report text (str)
    :return: the text with one space inserted after every glued section marker
    """
    # The lazy {1,2}? stops at the first "节", so a heading that itself starts with
    # "节" (e.g. "第一节节能减排") keeps all of its characters.
    # The lookahead restricts the fix to markers directly followed by text.
    return re.sub(r'(第\S{1,2}?节)(?=\S)', r'\1 ', content)


def filter4sentences(filter_keywords, sentences):
    """
    Keep the sentences that contain at least one of the filter keywords.

    :param filter_keywords: keywords to look for (substring match)
    :param sentences: report records; each has the keys
                      '年报文件名称', '年份', '股票简称' and '句子' (a list of sentences)
    :return: one flat record per matching sentence, carrying the report's
             '年报文件名称', '年份', '股票简称' and the sentence itself under '句子'
    """
    matched = []
    sentence_count = 0
    for report in tqdm(sentences):
        candidates = report['句子']
        sentence_count += len(candidates)
        # A sentence is emitted once, however many keywords it contains.
        matched.extend(
            {'年报文件名称': report['年报文件名称'],
             '年份': report['年份'],
             '股票简称': report['股票简称'],
             '句子': text}
            for text in candidates
            if any(keyword in text for keyword in filter_keywords)
        )
    print('句子总数：' + str(sentence_count) + '，筛选出的句子数量：' + str(len(matched)))

    return matched


# Prediction
def predict(s, negdict, posdict, nodict, plusdict):
    """
    Score the polarity of one sentence by dictionary matching.

    Masked phrases (module-level ``reverse_words``) are removed first, then the
    sentence is tokenized with jieba and every token is scored
    (``prev`` / ``next`` are the neighbouring tokens):

    * negative word: +1 if prev is a negation word, -2 if prev is a degree
      adverb, otherwise -1
    * positive word: -1 if prev is a negation word, +2 if prev is a degree
      adverb, -1 if prev or next is a negative word, otherwise +1
    * negation word that is neither negative nor positive: -0.5

    :param s: sentence text
    :param negdict: negative sentiment words
    :param posdict: positive sentiment words
    :param nodict: negation words
    :param plusdict: degree adverbs
    :return: ``(score, tokens, hits)``; ``hits`` maps '积极词' / '消极词' / '副词' /
             '否定词' to a sorted, de-duplicated list of ``(token_index, word)`` tuples
    """
    for masked in reverse_words:  # strip masked phrases before tokenizing
        s = s.replace(masked, '')

    # Each token is tested against up to four dictionaries; hash sets make every
    # test O(1) instead of a scan over the whole word list.
    neg, pos, negations, adverbs = (
        words if isinstance(words, (set, frozenset)) else set(words)
        for words in (negdict, posdict, nodict, plusdict)
    )

    sd = list(jieba.cut(s))  # tokenize
    last = len(sd) - 1
    hits = {'积极词': [], '消极词': [], '副词': [], '否定词': []}
    p = 0
    for i, word in enumerate(sd):
        has_prev = i > 0
        prev = sd[i - 1] if has_prev else None
        if word in neg:
            hits['消极词'].append((i, word))
            if has_prev and prev in negations:
                p += 1
                hits['否定词'].append((i - 1, prev))
            elif has_prev and prev in adverbs:
                p -= 2
                # Recorded as (index, word) like every other hit; this entry used
                # to be the bare word.
                hits['副词'].append((i - 1, prev))
            else:
                p -= 1
        elif word in pos:
            hits['积极词'].append((i, word))
            if has_prev and prev in negations:
                p -= 1
                hits['否定词'].append((i - 1, prev))
            elif has_prev and prev in adverbs:
                p += 2
                hits['副词'].append((i - 1, prev))
            elif has_prev and prev in neg:
                p -= 1
                hits['消极词'].append((i - 1, prev))
            elif i < last and sd[i + 1] in neg:
                p -= 1
                hits['消极词'].append((i + 1, sd[i + 1]))
            else:
                p += 1
        elif word in negations:
            p -= 0.5
            hits['否定词'].append((i, word))

    # A neighbour can be recorded twice (once on its own, once as context), hence
    # the de-duplication; sorting makes the output identical from run to run.
    return p, sd, {label: sorted(set(found)) for label, found in hits.items()}


def get_echart_line_map(years_range_list, positive_count_list, negative_count_list, title='数字化转型正负面趋势分析'):
    """
    Generate the trend line chart by filling in the ECharts HTML template.

    The placeholder texts ``years_range_list``, ``positive_count_list`` and
    ``negative_count_list`` inside ``input_data/echart_line_template.html`` are
    replaced with ``str()`` of the matching argument, and the page is written to
    ``outputs/<title>_index.html``. Both paths are relative to the current
    working directory.

    :param years_range_list: years on the x axis
    :param positive_count_list: number of positive sentences per year
    :param negative_count_list: number of negative sentences per year
    :param title: chart title, used as the prefix of the output file name
    :return: None
    """
    # Explicit encoding: without it open() falls back to the platform default.
    # NOTE(review): assumes the template is stored as UTF-8 - confirm.
    with open('input_data/echart_line_template.html', 'r', encoding='utf-8') as file:
        html = file.read()

    for placeholder, values in (
            ('years_range_list', years_range_list),
            ('positive_count_list', positive_count_list),
            ('negative_count_list', negative_count_list),
    ):
        html = html.replace(placeholder, str(values))

    with open(os.path.join('outputs', title + '_' + 'index.html'), 'w', encoding='utf-8') as file2:
        file2.write(html)


def sentiment_analysis(years_range_list, sentences, path):
    """
    Score every distinct sentence with ``predict`` and export the result to Excel.

    The sheet ``分析结果_test.xlsx`` is written into ``path`` with the columns
    年报文件名称 / 年份 / 股票简称 / 句子 / 分词 / pred / info, where ``pred`` is
    1 (positive), 0 (negative) or -2 (no sentiment).

    :param years_range_list: years to analyse
    :param sentences: records with the keys 年报文件名称 / 年份 / 股票简称 / 句子
    :param path: directory that receives the Excel file
    :return: True once the file has been written
    """
    mydata = pd.DataFrame(data=sentences)
    if mydata.empty:
        # Nothing to analyse: keep the expected columns so the de-duplication and
        # the export below still work and produce an empty sheet instead of a KeyError.
        mydata = pd.DataFrame(columns=['年报文件名称', '年份', '股票简称', '句子'])
    len1 = len(mydata)
    mydata.drop_duplicates(subset=['句子'], inplace=True)
    mydata.reset_index(drop=True, inplace=True)
    print('去重数量为：' + str(len1 - len(mydata)) + '，剩余条数：' + str(len(mydata)))

    # Collect the new columns in plain lists and attach them once, rather than
    # writing the frame cell by cell.
    tokens_col = []
    pred_col = []
    info_col = []
    for sentence in tqdm(mydata['句子'].tolist()):
        score, sd, info = predict(sentence, negdict, posdict, nodict, plusdict)
        tokens_col.append(' '.join(sd))
        if score > 0:
            pred_col.append(1.0)  # positive
        elif score < 0:
            pred_col.append(0.0)  # negative
        else:
            pred_col.append(-2.0)  # no sentiment
        info_col.append(json.dumps(info, ensure_ascii=False))
    mydata['分词'] = tokens_col
    # Labels stay floats; presumably this matches the dtype the former row-wise
    # `.loc` assignment into a new column produced - verify if consumers care.
    mydata['pred'] = pd.Series(pred_col, index=mydata.index, dtype='float64')
    mydata['info'] = info_col

    print(mydata.head(10))

    mydata.to_excel(os.path.join(path, '分析结果_test.xlsx'), index=False,
                    columns=['年报文件名称', '年份', '股票简称', '句子', '分词', 'pred', 'info'])

    # Per-year positive / negative counts for the trend chart; a year without data counts 0.
    # NOTE(review): process_v2 passes the years as strings; if 年份 is numeric in the
    # data, no year will match here - confirm before re-enabling the chart.
    years_present = set(mydata['年份'].tolist())
    positive_count = []
    negative_count = []
    for year in years_range_list:
        if year in years_present:
            yearly_pred = mydata.loc[mydata['年份'] == year, 'pred']
            positive_count.append(int((yearly_pred == 1).sum()))
            negative_count.append(int((yearly_pred == 0).sum()))
        else:
            positive_count.append(0)
            negative_count.append(0)

    # Chart rendering is currently disabled, so the counts above are not used yet.
    # get_echart_line_map(years_range_list, positive_count, negative_count)
    return True


def _fetch_report_sentences(industry_code, start_year, stop_year):
    """
    Load the pre-split sentences of every annual report of one industry from Oracle.

    :param industry_code: industry category code
    :param start_year: first year (inclusive)
    :param stop_year: last year (inclusive)
    :return: one dict per report with the keys 年报文件名称 / 年份 / 股票简称 / 句子
    """
    # Bind variables instead of string formatting: the values never become SQL text.
    # They are bound as strings, as the former quoted literals were.
    # NOTE(review): binds are sent as VARCHAR2; if INDUSTRY_CODE is a blank-padded
    # CHAR column wider than the code, confirm the comparison still matches.
    sql_str = ("SELECT TITLE, COMPLETE_SENTENCES, YEAR, STOCK_NAME FROM COMPANY_ANNUAL_REPORT "
               "WHERE INDUSTRY_CODE = :industry_code AND YEAR BETWEEN :start_year AND :stop_year")
    params = {'industry_code': str(industry_code),
              'start_year': str(start_year),
              'stop_year': str(stop_year)}
    print('\n' + sql_str + '\n' + str(params) + '\n\n')

    # NOTE(review): credentials are hard-coded in source; move them to configuration.
    connect = cx_Oracle.connect('cis', 'cis_zzsn9988', '114.116.91.1:1521/ORCL')
    try:
        cursor = connect.cursor()
        try:
            cursor.execute(sql_str, params)
            data = cursor.fetchall()
            print(data[0: 10])
            print('step2：完整句子提取【开始】 ...')
            complete_sentences = []
            for title, lob, year, stock_name in tqdm(data):
                # LOB values must be read while the connection is still open.
                # A NULL column is treated as an empty text.
                sentences_bytes = lob.read() if lob is not None else b''
                sentences_str = sentences_bytes.decode('utf8')  # bytes -> str
                complete_sentences.append({'年报文件名称': title,
                                           '年份': year,
                                           '股票简称': stock_name,
                                           '句子': sentences_str.split('<sep>')})
        finally:
            cursor.close()
    finally:
        # Always release the connection, including when the query or a LOB read fails.
        connect.close()
    print('step2：完整句子提取【完成】')
    return complete_sentences


def process_v2(IndustryCode='36', start_year=2016, stop_year=2020, path='./'):
    """
    Run the whole analysis for one industry: fetch the report sentences from the
    database, keep those that mention a digital-transformation keyword, score
    them and export the result.

    :param IndustryCode: industry category code (string, as listed in the company sheet)
    :param start_year: first year (inclusive)
    :param stop_year: last year (inclusive)
    :param path: directory that receives the Excel result
    :return: ``{"success": 1, "msg": ...}`` when the analysis ran; ``None`` when
             ``IndustryCode`` is not a known industry category code
    """
    year_range = [str(y) for y in range(int(start_year), int(stop_year) + 1)]
    filter_keywords_path = os.path.join(sentiment_base_dir, '数字化转型_词库.xlsx')
    types = {'行业大类代码': str, '上市公司代码': str}
    df = pd.read_excel(os.path.join(sentiment_base_dir, '截至2021年2季度上市公司_4352家【From证监会】.xlsx'), dtype=types)
    # First row seen for every industry category code.
    IndustryCode2info = {}
    for idx, row in df.iterrows():
        if row['行业大类代码'] not in IndustryCode2info:
            IndustryCode2info[row['行业大类代码']] = dict(row)
    print('行业大类数量：' + str(len(IndustryCode2info)))

    # Reject unknown codes before any database connection is opened.
    if IndustryCode not in df['行业大类代码'].tolist():
        print('Sorry! IndustryCategoryCode 不在”行业大类代码“中。请检查！')
        return None

    df_useful = df[df['行业大类代码'] == IndustryCode]
    print(df_useful.head(10))
    IndustryName = IndustryCode2info[IndustryCode]['行业大类名称']
    print('正在分析：' + IndustryName + ' ...')

    complete_sentences = _fetch_report_sentences(IndustryCode, start_year, stop_year)

    print('step3：符合条件的句子筛选【开始】 ...')
    keywords = []
    for sheet_name in ['来源【政策库】', '来源【模型推荐】']:
        df1 = pd.read_excel(filter_keywords_path, sheet_name=sheet_name)
        if sheet_name == '来源【政策库】':
            # Only rows with label == 1 are used from this sheet.
            df1 = df1[df1['label'] == 1]
        # Blank cells would come through as NaN and break the substring test later.
        keywords.extend(df1['关键词'].dropna().to_list())
    sentences_useful = filter4sentences(keywords, complete_sentences)
    print(sentences_useful[0: 5])
    print('step3：符合条件的句子筛选【完成】')

    print('step4: 正负面预测开始 ...')
    sentiment_analysis(year_range, sentences_useful, path)
    print('step4: 正负面预测完成！')

    return {"success": 1, "msg": "{} 行业分析完成！".format(IndustryCode)}

