package com.zzsn.event.util;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.mining.word.TfIdfCounter;
import com.hankcs.hanlp.seg.common.Term;
import org.springframework.util.StringUtils;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * @author kongliufeng
 * @Description HanLP-based keyword extraction utilities (phrase extraction, TextRank keywords, term-frequency counting)
 * @create 2020-08-31 18:50
 * @Version 1.0
 */
public class HanlpUtil {

    /** Matches a run of one or more characters in the range U+4E00..U+9FA5. Compiled once and reused. */
    private static final Pattern CHINESE_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]+");

    /** Candidates requested from HanLP per 100 characters by the getKeywordList* methods. */
    private static final int DEFAULT_QUOTA_PER_HUNDRED = 3;

    /** Candidates requested from HanLP per 100 characters by the extractKeyWordsByText methods. */
    private static final int WIDE_QUOTA_PER_HUNDRED = 30;

    /**
     * Extracts keywords from a single text using HanLP phrase extraction ({@code HanLP.extractPhrase}).
     * Stop words are removed via {@code StopWordsUtil.removeStopWords}, the remaining phrases are counted
     * with {@code StringUtil.getHitWordsAndTimes} and ordered by {@code SortUtil.sortMap}.
     *
     * @param text    the text to analyse
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (phrase, count) entries, or {@code null} when {@code text} is
     *         null/empty or HanLP returns no phrases (texts shorter than 34 characters request 0 phrases)
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByPhrase(String text, int limitNo) {
        List<String> phrases = extractPhrases(text);
        if (phrases == null) {
            return null;
        }
        StopWordsUtil.removeStopWords(phrases);
        return rankByFrequency(phrases, text, limitNo);
    }

    /**
     * Extracts keywords from a single text using HanLP phrase extraction and drops every phrase that
     * appears in {@code blackList}. Unlike {@link #getKeywordListByPhrase(String, int)}, the stop-word
     * filter is not applied here.
     *
     * @param text      the text to analyse
     * @param limitNo   maximum number of entries to return; values &lt;= 0 yield an empty list
     * @param blackList phrases to exclude; {@code null} is treated as an empty list
     * @return at most {@code limitNo} (phrase, count) entries, or {@code null} when {@code text} is
     *         null/empty or HanLP returns no phrases
     */
    public static List<Map.Entry<String, Integer>> getKeywordListExclude(String text, int limitNo, List<String> blackList) {
        List<String> phrases = extractPhrases(text);
        if (phrases == null) {
            return null;
        }
        if (blackList != null && !blackList.isEmpty()) {
            // Hash lookup instead of a linear List.contains scan for every phrase.
            Set<String> blocked = new HashSet<>(blackList);
            phrases.removeIf(blocked::contains);
        }
        return rankByFrequency(phrases, text, limitNo);
    }

    /**
     * Extracts keywords from several texts using HanLP phrase extraction and sums the per-text counts.
     * Null or empty texts are skipped, so one blank text no longer discards the results of the others.
     *
     * @param texts   the texts to analyse; {@code null} is treated as an empty collection
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (phrase, summed count) entries; never {@code null}
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByPhrase(Collection<String> texts, int limitNo) {
        Map<String, Integer> totals = new HashMap<>();
        if (texts != null) {
            for (String text : texts) {
                List<String> phrases = extractPhrases(text);
                if (phrases == null) {
                    continue;
                }
                StopWordsUtil.removeStopWords(phrases);
                mergeCounts(totals, StringUtil.getHitWordsAndTimes(phrases, text));
            }
        }
        return topEntries(totals, limitNo);
    }

    /**
     * Same as {@link #getKeywordListByPhrase(Collection, int)}, except that {@code excludeWord} is
     * forwarded to {@code StopWordsUtil.removeStopWords(List, String)} when filtering each text's phrases.
     *
     * @param texts       the texts to analyse; {@code null} is treated as an empty collection
     * @param limitNo     maximum number of entries to return; values &lt;= 0 yield an empty list
     * @param excludeWord extra argument handed to the stop-word filter
     * @return at most {@code limitNo} (phrase, summed count) entries; never {@code null}
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByPhraseExclude(Collection<String> texts, int limitNo, String excludeWord) {
        Map<String, Integer> totals = new HashMap<>();
        if (texts != null) {
            for (String text : texts) {
                List<String> phrases = extractPhrases(text);
                if (phrases == null) {
                    continue;
                }
                StopWordsUtil.removeStopWords(phrases, excludeWord);
                mergeCounts(totals, StringUtil.getHitWordsAndTimes(phrases, text));
            }
        }
        return topEntries(totals, limitNo);
    }

    /**
     * Extracts keywords from a single text using {@code HanLP.extractKeyword}, removes stop words and
     * ranks the remaining words by their count in the text.
     *
     * @param text    the text to analyse
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (keyword, count) entries, or {@code null} when {@code text} is
     *         null/empty or HanLP returns no keywords
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByTextRank(String text, int limitNo) {
        List<String> keywords = extractKeywords(text);
        if (keywords == null) {
            return null;
        }
        StopWordsUtil.removeStopWords(keywords);
        return rankByFrequency(keywords, text, limitNo);
    }

    /**
     * Extracts keywords from several texts using {@code HanLP.extractKeyword} and sums the per-text counts.
     * Null or empty texts are skipped, so one blank text no longer discards the results of the others.
     *
     * @param texts   the texts to analyse; {@code null} is treated as an empty collection
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (keyword, summed count) entries; never {@code null}
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByTextRank(Collection<String> texts, int limitNo) {
        Map<String, Integer> totals = new HashMap<>();
        if (texts != null) {
            for (String text : texts) {
                List<String> keywords = extractKeywords(text);
                if (keywords == null) {
                    continue;
                }
                StopWordsUtil.removeStopWords(keywords);
                mergeCounts(totals, StringUtil.getHitWordsAndTimes(keywords, text));
            }
        }
        return topEntries(totals, limitNo);
    }

    /**
     * Returns the leading non-stop-word entries of {@code TfIdfCounter.sortedAllTfInt()} for a single text.
     *
     * @param text    the text to analyse; {@code null} yields an empty list
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (term, frequency) entries; never {@code null}
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByTFIDF(String text, int limitNo) {
        if (text == null || limitNo <= 0) {
            return new ArrayList<>();
        }
        TfIdfCounter counter = new TfIdfCounter();
        counter.add(text);
        return topTermFrequencies(counter, limitNo);
    }

    /**
     * Returns the leading non-stop-word entries of {@code TfIdfCounter.sortedAllTfInt()} computed over
     * all given texts.
     *
     * @param texts   the texts to analyse; {@code null} and null elements are ignored
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (term, frequency) entries; never {@code null}
     */
    public static List<Map.Entry<String, Integer>> getKeywordListByTFIDF(Collection<String> texts, int limitNo) {
        if (texts == null || limitNo <= 0) {
            return new ArrayList<>();
        }
        TfIdfCounter counter = new TfIdfCounter();
        for (String text : texts) {
            if (text != null) {
                counter.add(text);
            }
        }
        return topTermFrequencies(counter, limitNo);
    }

    /**
     * Extracts keywords from a single text.
     * <p>
     * Candidates come from {@code HanLP.extractKeyword} (up to 30 per 100 characters). A candidate is
     * dropped when it is shorter than 3 characters, contains no Chinese character, is recognised by
     * {@code CompanyUtil.entityAll}, or segments into exactly one term whose nature neither contains
     * {@code "n"} nor equals {@code "gi"}.
     *
     * @param text    the text to analyse
     * @param limitNo maximum number of entries to return; values &lt;= 0 yield an empty list
     * @return at most {@code limitNo} (keyword, count) entries, or {@code null} when {@code text} is
     *         null/empty or HanLP returns {@code null}
     */
    public static List<Map.Entry<String, Integer>> extractKeyWordsByText(String text, int limitNo) {
        return extractFilteredKeywords(text, limitNo, 3, Collections.<String>emptySet(), Collections.<String>emptyList());
    }

    /**
     * Reports whether the string contains at least one character in the range U+4E00..U+9FA5.
     * Note that this is a "contains" test: mixed strings such as {@code "abc中"} also return {@code true}.
     *
     * @param str the string to test
     * @return {@code false} when {@code oConvertUtils.isEmpty(str)} holds or no such character is found
     */
    public static boolean isChinese(String str) {
        if (oConvertUtils.isEmpty(str)) {
            return false;
        }
        return CHINESE_PATTERN.matcher(str).find();
    }

    /**
     * Extracts keywords from a single text, honouring a blacklist and a whitelist.
     * <p>
     * Candidates come from {@code HanLP.extractKeyword} (up to 30 per 100 characters). A candidate is
     * dropped when it is blacklisted, shorter than 2 characters, contains no Chinese character, is
     * recognised by {@code CompanyUtil.entityAll}, or segments into exactly one term whose nature is
     * rejected: two-character words must have nature {@code "nz"}; longer words must have a nature that
     * contains {@code "n"} or equals {@code "gi"}. Whitelisted words are then appended unfiltered before
     * counting.
     *
     * @param text      the text to analyse
     * @param limitNo   maximum number of entries to return; values &lt;= 0 yield an empty list
     * @param blackList words to exclude; {@code null} is treated as an empty list
     * @param whiteList words to always include in the counting step; {@code null} is treated as empty
     * @return at most {@code limitNo} (keyword, count) entries, or {@code null} when {@code text} is
     *         null/empty or HanLP returns {@code null}
     */
    public static List<Map.Entry<String, Integer>> extractKeyWordsByText(String text, int limitNo, List<String> blackList, List<String> whiteList) {
        Set<String> blocked = blackList == null ? Collections.<String>emptySet() : new HashSet<>(blackList);
        List<String> allowed = whiteList == null ? Collections.<String>emptyList() : whiteList;
        return extractFilteredKeywords(text, limitNo, 2, blocked, allowed);
    }

    /** Shared implementation of both extractKeyWordsByText overloads. */
    private static List<Map.Entry<String, Integer>> extractFilteredKeywords(String text, int limitNo, int minLength,
                                                                            Collection<String> blackList,
                                                                            Collection<String> whiteList) {
        if (!StringUtils.hasLength(text)) {
            return null;
        }
        List<String> candidates = HanLP.extractKeyword(text, quota(text, WIDE_QUOTA_PER_HUNDRED));
        if (candidates == null) {
            return null;
        }
        // LinkedHashSet de-duplicates while keeping first-seen order, like stream().distinct().
        Set<String> kept = new LinkedHashSet<>();
        for (String candidate : candidates) {
            if (!kept.contains(candidate) && !isRejectedKeyword(candidate, minLength, blackList)) {
                kept.add(candidate);
            }
        }
        kept.addAll(whiteList);
        return rankByFrequency(new ArrayList<>(kept), text, limitNo);
    }

    /** Decides whether a keyword candidate fails the blacklist, length, language, entity or nature checks. */
    private static boolean isRejectedKeyword(String word, int minLength, Collection<String> blackList) {
        if (blackList.contains(word)) {
            return true;
        }
        if (word.length() < minLength || !isChinese(word)) {
            return true;
        }
        // Person, place and organisation names are not wanted as keywords.
        List<String> entities = CompanyUtil.entityAll(word);
        if (entities != null && !entities.isEmpty()) {
            return true;
        }
        // The nature rule only applies when the word segments into exactly one term with a known nature.
        List<Term> terms = HanLP.segment(word);
        if (terms == null || terms.size() != 1 || terms.get(0).nature == null) {
            return false;
        }
        String nature = terms.get(0).nature.toString();
        if (nature == null) {
            return true;
        }
        if (word.length() == 2) {
            return !nature.equals("nz");
        }
        return !nature.contains("n") && !nature.equals("gi");
    }

    /** Runs HanLP phrase extraction with the default quota; returns null for empty text or no phrases. */
    private static List<String> extractPhrases(String text) {
        if (!StringUtils.hasLength(text)) {
            return null;
        }
        List<String> phrases = HanLP.extractPhrase(text, quota(text, DEFAULT_QUOTA_PER_HUNDRED));
        return (phrases == null || phrases.isEmpty()) ? null : phrases;
    }

    /** Runs HanLP keyword extraction with the default quota; returns null for empty text or no keywords. */
    private static List<String> extractKeywords(String text) {
        if (!StringUtils.hasLength(text)) {
            return null;
        }
        List<String> keywords = HanLP.extractKeyword(text, quota(text, DEFAULT_QUOTA_PER_HUNDRED));
        return (keywords == null || keywords.isEmpty()) ? null : keywords;
    }

    /** Computes how many candidates to request for a text; uses long arithmetic so long texts cannot overflow. */
    private static int quota(String text, int perHundredChars) {
        return (int) ((long) text.length() * perHundredChars / 100);
    }

    /** Counts each word in the text and returns the top entries of the sorted result. */
    private static List<Map.Entry<String, Integer>> rankByFrequency(List<String> words, String text, int limitNo) {
        return topEntries(StringUtil.getHitWordsAndTimes(words, text), limitNo);
    }

    /** Adds every count of {@code source} onto {@code target}. */
    private static void mergeCounts(Map<String, Integer> target, Map<String, Integer> source) {
        for (Map.Entry<String, Integer> entry : source.entrySet()) {
            target.merge(entry.getKey(), entry.getValue(), Integer::sum);
        }
    }

    /** Sorts the counts with SortUtil.sortMap and returns an independent copy of the first limitNo entries. */
    private static List<Map.Entry<String, Integer>> topEntries(Map<String, Integer> counts, int limitNo) {
        List<Map.Entry<String, Integer>> sorted = SortUtil.sortMap(counts);
        if (limitNo >= sorted.size()) {
            return sorted;
        }
        // Clamp negative limits and copy, so callers do not receive a subList view of the full list.
        return new ArrayList<>(sorted.subList(0, Math.max(limitNo, 0)));
    }

    /** Collects up to limitNo non-stop-word entries from the counter's sorted term frequencies. */
    private static List<Map.Entry<String, Integer>> topTermFrequencies(TfIdfCounter counter, int limitNo) {
        List<Map.Entry<String, Integer>> result = new ArrayList<>();
        if (limitNo <= 0) {
            return result;
        }
        counter.compute();
        for (Map.Entry<String, Integer> entry : counter.sortedAllTfInt()) {
            if (StopWordsUtil.isStopWord(entry.getKey())) {
                continue;
            }
            result.add(entry);
            if (result.size() >= limitNo) {
                break;
            }
        }
        return result;
    }
}
