#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File    : test_label.py
# @Time    : 2022/1/7 18:28
# @Author  : Mr.Ygg
# @Software: PyCharm

import os
import pandas as pd

from classification.utils.utils import load_risk_keywords, is_include_compound_words

# Folder that holds the 'input_file' and 'output_file' sub-directories.
root_dir = '../data/datasource/test'
# Base name (without extension) of the workbook to label.
# Alternative dataset: file_name = '项目风险模型数据集_总'
file_name = '去重_F_ZP_GP'

input_path = os.path.join(root_dir, 'input_file/{}.xlsx'.format(file_name))
df = pd.read_excel(input_path)

# Title ('标题') and body ('正文') columns; they are labelled row by row.
list_title, list_content = df['标题'], df['正文']

# Country names to search for, one per line of country.txt.  Only the text
# before the first opening parenthesis (ASCII '(' or full-width '（') is kept.
list_country = []
with open('../config/country.txt', 'r', encoding='utf-8') as f:
    for line in f:
        country_name = line.strip().split('(')[0].split('（')[0].strip()
        # Skip blank entries: an empty string is a substring of every text,
        # so a single blank line would make every row match a "country".
        if country_name:
            list_country.append(country_name)

# Risk-classification keywords.  The result is iterated further down as
# category -> keyword entries, where one entry may join several words with '+'.
risk_keywords_path = '../config/risk_keywords.xlsx'
dict_risk_keywords = load_risk_keywords(risk_keywords_path)

def _cell_to_text(value):
    """Return a spreadsheet cell as text, mapping missing cells to ''.

    Strings are returned unchanged.  Empty cells (read by pandas as NaN, or
    None) become the empty string.  Any other scalar, such as a number or a
    date, is converted with ``str`` so that the string concatenation done by
    the labelling loop cannot fail on it.
    """
    if isinstance(value, str):
        return value
    if value is None or pd.isna(value):
        return ''
    return str(value)


# Per-row labels, filled in the same order as the rows of the input sheet.
list_bool_yiqing = []
list_bool_country = []
list_risk_key_words_category = []
for title, content in zip(list_title, list_content):
    title = _cell_to_text(title)
    content = _cell_to_text(content)

    # Country screening: only the title and the first fifth of the body are
    # searched for a country name.
    lead_text = title + '。' + content[: len(content) // 5]
    has_country = any(country in lead_text for country in list_country)
    list_bool_country.append('是' if has_country else '否')

    # The remaining checks look at the title plus the complete body.
    text = title + '。' + content

    # Keyword flag: does the text mention '疫情'?
    list_bool_yiqing.append('是' if '疫情' in text else '否')

    # Risk keywords: count, per category, how many keyword entries match.
    # An entry such as 'a+b' is split on '+' and handed to
    # is_include_compound_words as a list of words.
    dict_risk_keywords_num = {}
    for risk_keywords_key in dict_risk_keywords:
        dict_risk_keywords_num[risk_keywords_key] = sum(
            1
            for risk_keyword in dict_risk_keywords[risk_keywords_key]
            if is_include_compound_words(
                text=text, compound_words=risk_keyword.split('+')
            )
        )

    if any(dict_risk_keywords_num.values()):
        # Category with the most matches; on a tie max() keeps the category
        # that comes first in dict_risk_keywords.
        risk_category = max(dict_risk_keywords_num, key=dict_risk_keywords_num.get)
    else:
        # No keyword matched at all.
        risk_category = '无风险'

    list_risk_key_words_category.append(risk_category)


# Attach the per-row labels as new columns: the '疫情' keyword flag, the
# country flag and the keyword-based risk category.
df['是否含"疫情"关键词'] = list_bool_yiqing
df['是否含一带一路相关国家'] = list_bool_country
df['关键词分类'] = list_risk_key_words_category

output_path = os.path.join(root_dir, 'output_file/{}_result.xlsx'.format(file_name))
# Create the output folder first so that a missing directory does not throw
# away a finished labelling run when the workbook is written.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_excel(output_path)