package com.zzsn.generation.segment;

import com.zzsn.generation.test.IDFHash;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary;

import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.TraditionalChineseTokenizer;
import com.hankcs.hanlp.utility.SentencesUtil;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

/**
 * 分词工具类 创建人：李东亮 创建时间：2015-6-15 下午6:04:38 公司 ：郑州数能软件科技有限公司
 * 
 * @version 1.0
 * 
 */
public class SegmentUtil {
    
    /** Result-map key under which the segmentation methods of this class store the list of words. */
    public static final String SEGMENT_WORD = "word";
    /** Result-map key under which the segmentation methods store the nature strings, parallel to the word list. */
    public static final String SEGMENT_NATURE = "nature";
    
    // The ENTITY_* keys below are referenced only from commented-out code in this file
    // (a disabled entityRecognize method); they stay public, so other classes may still use them.
    /** Result-map key for person names. */
    public static final String ENTITY_NAME = "name";
    /** Result-map key for place names. */
    public static final String ENTITY_PLACE = "place";
    /** Result-map key for organization names. */
    public static final String ENTITY_ORG = "org";

	/**
	 * 中文分词 创建人: 李东亮 创建时间: 2015-6-10 下午5:04:40
	 * 
	 * @version 1.0
	 * @param content
	 * @return
	 */
//	public static List<String> segment(String content) {
//
//		List<String> result = new ArrayList<String>();
//		TokenStream tokenStream = null;
//		try {
//			tokenStream = AnalyzerBuilder.getInstance().tokenStream("content",
//					new StringReader(content));
//		} catch (IOException e1) {
//			// TODO Auto-generated catch block
//			e1.printStackTrace();
//		}
//		tokenStream.addAttribute(CharTermAttribute.class);
//		CharTermAttribute charTermAttribute;
//		try {
//			tokenStream.reset();
//			while (tokenStream.incrementToken()) {
//				charTermAttribute = tokenStream
//						.getAttribute(CharTermAttribute.class);
//				result.add(charTermAttribute.toString());
//			}
//		} catch (IOException e) {
//			// TODO Auto-generated catch block
//			e.printStackTrace();
//		} finally {
//			try {
//				tokenStream.close();
//			} catch (IOException e) {
//				// TODO Auto-generated catch block
//				e.printStackTrace();
//			}
//		}
//		return result;
//	}

	/**
	 * Filters stop words out of a list of segmented words.
	 *
	 * <p>A word is dropped when {@code IDFHash} reports it as a Chinese or an English
	 * stop word; every other word is kept in its original order. The input list is
	 * only read, never modified.
	 *
	 * @param words segmented words to filter
	 * @return a new list containing only the words that are not stop words
	 */
	public static List<String> cleanResult(List<String> words) {
		List<String> kept = new ArrayList<String>();
		for (String candidate : words) {
			boolean stopWord = IDFHash.IsChiStopWord(candidate) || IDFHash.IsEngStopWord(candidate);
			if (!stopWord) {
				// Open question left by the original author: should misspelled
				// English words be corrected here? Nothing is converted for now.
				kept.add(candidate);
			}
		}
		return kept;
	}

	/**
	 * Segments text with HanLP and returns the words together with their nature
	 * (part-of-speech) strings.
	 *
	 * <p>The content is split into sentences; each sentence is segmented by a HanLP
	 * segmenter with organization recognition enabled, and the resulting terms are
	 * passed through {@code CoreStopWordDictionary.apply}. Terms whose nature equals
	 * {@code Nature.w} (HanLP's punctuation tag) are skipped.
	 *
	 * @param content text to segment
	 * @return map with two parallel lists: {@link #SEGMENT_WORD} holds the words and
	 *         {@link #SEGMENT_NATURE} holds the nature string of the word at the same index
	 */
	public static HashMap<String, List<String>> segmentHanlp(String content) {
		List<String> sentences = SentencesUtil.toSentenceList(content);
		List<String> words = new ArrayList<String>();
		List<String> natures = new ArrayList<String>();

		// One segmenter instance is created per call and reused for every sentence.
		// (The original also kept a disabled alternative: HanLP.newSegment("crf").)
		Segment segmenter = HanLP.newSegment().enableOrganizationRecognize(true);
		for (String sentence : sentences) {
			List<Term> terms = segmenter.seg(sentence);
			CoreStopWordDictionary.apply(terms);
			for (Term term : terms) {
				if (!term.nature.equals(Nature.w)) {
					words.add(term.word);
					natures.add(term.nature.toString());
				}
			}
		}

		HashMap<String, List<String>> result = new HashMap<String, List<String>>();
		result.put(SegmentUtil.SEGMENT_WORD, words);
		result.put(SegmentUtil.SEGMENT_NATURE, natures);
		return result;
	}
	
	/**
	 * Segments Traditional Chinese text and returns the words together with their
	 * nature (part-of-speech) strings.
	 *
	 * <p>The content is split into sentences and each sentence is segmented with
	 * {@code TraditionalChineseTokenizer}. Terms whose nature equals {@code Nature.w}
	 * (HanLP's punctuation tag) are skipped. No stop-word filtering is applied on
	 * this path.
	 *
	 * @param content Traditional Chinese text to segment
	 * @return map with two parallel lists: {@link #SEGMENT_WORD} holds the words and
	 *         {@link #SEGMENT_NATURE} holds the nature string of the word at the same index
	 */
	public static HashMap<String, List<String>> segmentHanlpForTw(String content) {
		HashMap<String, List<String>> resultMap = new HashMap<String, List<String>>();
		List<String> wordList = new ArrayList<String>();
		List<String> natureList = new ArrayList<String>();

		// The whole content used to be segmented once up front and the result thrown
		// away; that redundant pass is gone, only the per-sentence segmentation remains.
		for (String sentence : SentencesUtil.toSentenceList(content)) {
			List<Term> termList = TraditionalChineseTokenizer.segment(sentence);
			for (Term term : termList) {
				if (term.nature.equals(Nature.w)) {
					continue;
				}
				wordList.add(term.word);
				natureList.add(term.nature.toString());
			}
		}

		resultMap.put(SegmentUtil.SEGMENT_WORD, wordList);
		resultMap.put(SegmentUtil.SEGMENT_NATURE, natureList);
		return resultMap;
	}
    /**
     * Segments text with {@code HanLP.segment} and returns the stop-word-filtered
     * words together with their nature (part-of-speech) strings.
     *
     * <p>The content is split into sentences; the terms of each sentence are passed
     * through {@code CoreStopWordDictionary.apply}, and terms whose nature equals
     * {@code Nature.w} (HanLP's punctuation tag) are skipped.
     *
     * @param content text to segment
     * @return map with two parallel lists: {@link #SEGMENT_WORD} holds the words and
     *         {@link #SEGMENT_NATURE} holds the nature string of the word at the same index
     */
    public static HashMap<String, List<String>> segmentCleanHanlp(String content) {
        List<String> sentences = SentencesUtil.toSentenceList(content);
        List<String> words = new ArrayList<String>();
        List<String> natures = new ArrayList<String>();

        for (String sentence : sentences) {
            List<Term> terms = HanLP.segment(sentence);
            CoreStopWordDictionary.apply(terms);
            for (Term term : terms) {
                if (!term.nature.equals(Nature.w)) {
                    words.add(term.word);
                    natures.add(term.nature.toString());
                }
            }
        }

        HashMap<String, List<String>> result = new HashMap<String, List<String>>();
        result.put(SegmentUtil.SEGMENT_WORD, words);
        result.put(SegmentUtil.SEGMENT_NATURE, natures);
        return result;
    }
    
    /**
     * Extracts person names from an existing segmentation result.
     *
     * <p>Walks the word list and the nature list in parallel and collects every word
     * whose nature string is exactly {@code "nr"} (HanLP's person-name tag).
     *
     * @param wordsMap map holding parallel lists under {@link #SEGMENT_WORD} and
     *                 {@link #SEGMENT_NATURE}, in the shape the segmentation methods
     *                 of this class return; may be {@code null} or incomplete
     * @return the words tagged {@code "nr"}, in their original order; an empty list
     *         when the map is {@code null}, empty, or lacks either list; never {@code null}
     */
    public static List<String> nameRecognize(HashMap<String, List<String>> wordsMap) {
        List<String> names = new ArrayList<String>();
        if (wordsMap == null) {
            return names;
        }

        List<String> wordList = wordsMap.get(SegmentUtil.SEGMENT_WORD);
        List<String> natureList = wordsMap.get(SegmentUtil.SEGMENT_NATURE);
        if (wordList == null || natureList == null) {
            // Nothing to pair up; a missing list previously caused a NullPointerException.
            return names;
        }

        // Advance both lists together and stop at the shorter one, so a length
        // mismatch can no longer raise IndexOutOfBoundsException.
        Iterator<String> wordIt = wordList.iterator();
        Iterator<String> natureIt = natureList.iterator();
        while (wordIt.hasNext() && natureIt.hasNext()) {
            String word = wordIt.next();
            if ("nr".equals(natureIt.next())) {
                names.add(word);
            }
        }
        return names;
    }
    /**
     * Extracts person names directly from raw text.
     *
     * <p>The content is split into sentences, each sentence is segmented with
     * {@code HanLP.segment}, and the terms are passed through
     * {@code CoreStopWordDictionary.apply}. Every remaining term whose nature equals
     * {@code Nature.nr} (HanLP's person-name tag) contributes its word to the result.
     *
     * @param content text to scan
     * @return the words tagged as person names, in order of appearance
     */
    public static List<String> nameRecognize(String content) {
        List<String> found = new ArrayList<String>();
        List<String> sentences = SentencesUtil.toSentenceList(content);

        for (String sentence : sentences) {
            List<Term> terms = HanLP.segment(sentence);
            CoreStopWordDictionary.apply(terms);
            for (Term term : terms) {
                // Punctuation is ruled out first, then the person-name tag is required.
                if (!term.nature.equals(Nature.w) && term.nature.equals(Nature.nr)) {
                    found.add(term.word);
                }
            }
        }
        return found;
    }
    
    /**
     * Extracts phrases from the text as a stand-in for "events".
     *
     * <p>This is a direct delegation to {@code HanLP.extractPhrase} with a size
     * argument of 5; no event-specific logic is applied here.
     *
     * @param content text to analyse
     * @return the phrase list produced by {@code HanLP.extractPhrase(content, 5)}
     */
    public static List<String> eventRecognize(String content) {
        return HanLP.extractPhrase(content, 5);
    }
    
    /**
     * Extracts time expressions from raw text.
     *
     * <p>The content is split into sentences, each sentence is segmented with
     * {@code HanLP.segment}, and the terms are passed through
     * {@code CoreStopWordDictionary.apply}. Every remaining term whose nature equals
     * {@code Nature.t} (HanLP's time-word tag) contributes its word to the result.
     *
     * <p>NOTE(review): the stop-word filter runs before the tag check, so any time
     * expression it removes is not reported — confirm this is intended.
     *
     * @param content text to scan
     * @return the words tagged as time words, in order of appearance
     */
    public static List<String> dateRecognize(String content) {
        List<String> dates = new ArrayList<String>();

        for (String sentence : SentencesUtil.toSentenceList(content)) {
            List<Term> termList = HanLP.segment(sentence);
            CoreStopWordDictionary.apply(termList);
            for (Term term : termList) {
                // Fix: this used to test Nature.nr (person name), a copy of the
                // name-recognition loop, so the method returned names instead of times.
                if (Nature.t.equals(term.nature)) {
                    dates.add(term.word);
                }
            }
        }
        return dates;
    }
    
    /**
     * 识别命名实体
     * 创建人:  victory  
     * 创建时间:  2016-6-8 下午4:30:35 
     * @version 1.0
     * @param content
     * @return
     */
//    public static HashMap<String, List<String>> entityRecognize(String content) {
//        HashMap<String, List<String>> resultMap = new HashMap<String, List<String>>();
//        List<String> sentenceList = SentencesUtil.toSentenceList(content);
//	     PerceptronLexicalAnalyzer analyzer = null;
//			try {
//				analyzer = new PerceptronLexicalAnalyzer("data/model/perceptron/pku199801/cws.bin",
//				         HanLP.Config.PerceptronPOSModelPath,
//				         HanLP.Config.PerceptronNERModelPath);
//			} catch (IOException e) {
//				// TODO Auto-generated catch block
//				e.printStackTrace();
//			}
//        /**
//         * 特征实体
//         */
//        List<String> names = new ArrayList<String>();  //人名
//        List<String> places = new ArrayList<String>(); //地域
//        List<String> orgs = new ArrayList<String>();   //组织机构
//        //根据词性获取 人名、地域、组织机构等
//        for (String sentence : sentenceList) {
////            Sentence termList = analyzer.analyze(sentence); //调用hanlp算法进行分词
//            List<Term> termList = HanLP.segment(sentence); //调用hanlp算法进行分词
////            CoreStopWordDictionary.apply(termList);
////            for (IWord word : termList)
////
////            {
////
////                if (word instanceof CompoundWord)
////                {
/////*                	if (((CompoundWord) word).getLabel().equals("nt")) {
////                		System.out.println("nt(组合):"+((CompoundWord) word).getValue());
////                	}  else if (((CompoundWord) word).getLabel().equals("ntc")) {
////                		System.out.println("ntc(组合):"+((CompoundWord) word).getValue());
////                	}  else if (((CompoundWord) word).getLabel().equals("ntcf")) {
////                		System.out.println("ntcf(组合):"+((CompoundWord) word).getValue());
////                	}  else if (((CompoundWord) word).getLabel().equals("nto")) {
////                		System.out.println("nto(组合):"+((CompoundWord) word).getValue());
////                	}  else if (((CompoundWord) word).getLabel().equals("ntch")) {
////                		System.out.println("ntch(组合):"+((CompoundWord) word).getValue());
////                	}  else if (((CompoundWord) word).getLabel().equals("nth")) {
////                		System.out.println("nth(组合):"+((CompoundWord) word).getValue());
////                	}*/
////                	if (((CompoundWord) word).getLabel().equals("nr")) {
////                		System.out.println("nr(组合):"+((CompoundWord) word).getValue());
////                	}  else if (((CompoundWord) word).getLabel().equals("ns")) {
////                		System.out.println("ns(组合):"+((CompoundWord) word).getValue());
////                	}
////
////                } else {
////                	if (word.getLabel().equals("nt")) {
////                		orgs.add(word.getValue());
////                		System.out.println("nt:"+word.getValue());
////                	} else if (word.getLabel().equals("ntc")) {
////                		orgs.add(word.getValue());
////                		System.out.println("ntc:"+word.getValue());
////                	} else if (word.getLabel().equals("ntcf")) {
////                		orgs.add(word.getValue());
////                		System.out.println("ntcf:"+word.getValue());
////                	} else if (word.getLabel().equals("nto")) {
////                		orgs.add(word.getValue());
////                		System.out.println("nto:"+word.getValue());
////                	} else if (word.getLabel().equals("ntch")) {
////                		orgs.add(word.getValue());
////                		System.out.println("ntch:"+word.getValue());
////                	} else if (word.getLabel().equals("nth")) {
////                		orgs.add(word.getValue());
////                		System.out.println("nth:"+word.getValue());
////                	}
////                	if (word.getLabel().equals("nr")) {
////                		names.add(word.getValue());
//////                		System.out.println("nr:"+word.getValue());
////                	} else if (word.getLabel().equals("nrj")) {
////                		names.add(word.getValue());
////                		System.out.println("nrj:"+word.getValue());
////                	} else if (word.getLabel().equals("nr2")) {
////                		names.add(word.getValue());
////                		System.out.println("nr2:"+word.getValue());
////                	} else if (word.getLabel().equals("nrf")) {
////                		names.add(word.getValue());
////                		System.out.println("nrf:"+word.getValue());
////                	}
////                	else if (word.getLabel().equals("ns")) {
////                		places.add(word.getValue());
////                		System.out.println("ns:"+word.getValue());
////                	} else if (word.getLabel().equals("nsf")) {
////                		places.add(word.getValue());
////                		System.out.println("nsf:"+word.getValue());
////                	}
////                }
////
////            }
//            for(Term term : termList) {
//                if (term.nature.equals(Nature.w)) {
//                    continue;
//                }
//
//                switch (term.nature.toString())
//                {
//                    case "nr":
//                        names.add(term.word);
//                        break;
//                    case "nrj":
//                        names.add(term.word);
//                        break;
//                    case "nr2":
//                        names.add(term.word);
//                        break;
//                    case "nrf":
//                        names.add(term.word);
//                        break;
//                    case "ns":
//                        places.add(term.word);
//                        break;
//                    case "nsf":
//                        places.add(term.word);
//                        break;
//                    case "nt":
//                        orgs.add(term.word);
//                        break;
//                    case "ntc":
//                        orgs.add(term.word);
//                        break;
//                    case "ntcf":
//                        orgs.add(term.word);
//                        break;
//                    case "nto" :
//                        orgs.add(term.word);
//                        break;
//                    case "ntch":
//                        orgs.add(term.word);
//                        break;
//                    case "nth":
//                        orgs.add(term.word);
//                        break;
//                }
//            }
//        }
//
//        for (int i = 0; i < places.size(); i++) {
//			String place =places.get(i);
//
//			if("孟加拉".equals(place)) {
//				places.set(i, "孟加拉国");
//			}
//		}
//
//        resultMap.put(SegmentUtil.ENTITY_NAME, names);
//        resultMap.put(SegmentUtil.ENTITY_PLACE, places);
//        resultMap.put(SegmentUtil.ENTITY_ORG, orgs);
//        return resultMap;
//    }
    
    
    /**
     * 英文分词
     * step1 英文词法分析，去除数字、连字符、标点符号、特殊字符
     * step2 去停用词
     * step3 词干提取
     * @param content 文本内容
     * @return 
     */
//    public String englishSegment(String content){
//    	Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_48);
//        StringBuilder sb=new StringBuilder();
//        try {
//            TokenStream tokenStream = analyzer.tokenStream(null, content);
//            //设置波特词干提取器，自动去除停用词
//            tokenStream=new PorterStemFilter(tokenStream);
//            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
//            CharTermAttribute term=tokenStream.addAttribute(CharTermAttribute.class);
//            tokenStream.reset();//重置
//            while( tokenStream.incrementToken() ){
//                int startOffset = offsetAttribute.startOffset();
//                int endOffset = offsetAttribute.endOffset();
//                //去除数字
//                String word=term.toString().replaceAll("[^a-zA-Z]", "");
//                if(word.length()!=0)
//                    sb.append(word+" ");
//            }
//            tokenStream.end();
//            tokenStream.close();
//        } catch (IOException ex) {
////            Logger.getLogger(EnglishSegment.class.getName()).log(Level.SEVERE, null, ex);
//        }
//        return sb.toString();
//    }
//
////    public void setAnalyzer(Analyzer analyzer) {
////        this.analyzer = analyzer;
////    }
    
//	public static void main(String[] args) throws IOException {
//		HashMap<String, List<String>> entityMap =  SegmentUtil.entityRecognize("重庆市武隆区人民政府办公室关于印发武隆区粮食应急预案的通知");
//		List<String> places = entityMap.get(SegmentUtil.ENTITY_PLACE);
//
//		String xx="会议强调，要进一步推进国家药品集中采购试点、短缺药监测应对和医疗救助工作。一要完善集中采购制度，加强中标药品质量监管和供应保障，实现降价惠民。认真总结试点经验，及时全面推开。二要保障基本药物、急（抢）救等药品供应。完善监测预警机制，对临床必需、易短缺、替代性差等药品，采取强化储备、统一采购或定点生产等方式保供，防止急需、常用药品不合理涨价。三要在做好基本医疗保障的同时，进一步完善医疗救助制度，落实落细参保缴费资助、直接救助等措施，切实提高困难群众获得感。";
////		List<WordInfo> ad =	 HanLP.extractWords(xx, 5);
////		List<String> ad1 =	 HanLP.extractKeyword(xx, 5);
////		List<String> ad2 =	HanLP.extractPhrase(xx, 5);
////		NewWordDiscover discover = new NewWordDiscover(4, 0.0f, 0.5f, 200f, false);
//////		NewWordDiscover discover = new NewWordDiscover();
////		List<WordInfo> ad12 = discover.discover(xx, 5);
////	      String content="My mother always says that I am a naughty girl. When I go out with her, I always make some mistakes. Such as annoying others when they are talking, which is very impolite. My mother doesn’t punish me, instead, she tries to disturb me when I want to talk to her. It makes me realize how rude I am. My mother is so tolerant that she is a good mother. ";
//////	      SegmentUtil es=new SegmentUtil();
//////	        String result=es.englishSegment(content);
//////	        System.out.println(result);
////
//////	    String content = "德国马勒贝洱集团";
////	    content = content.toLowerCase();
////	//    List<String> sentenceList = SentencesUtil.toSentenceList(content);
//////	    List<Term> ad = HanLP.segment(content);
////	    System.out.println(ad);
//
//
////		List<Term> termList = HanLP.segment("毕业生");
////		System.out.println(termList.size());
////
//
//NewWordDiscover discover = new NewWordDiscover(5, 0.0f, 0.5f, 100f, true);
//
////读取文件夹下所以文档并合并成一篇文档用于新词发现
//
//StringBuilder sbText = new StringBuilder();
//
//    File[] txtFiles = new File("E:\\dataword").listFiles();
//
//    int i = 0;
//
//    for (File file : txtFiles)
//
//    {
//
//        System.out.printf("[%d / %d] 读取 %s 中...\n", ++i, txtFiles.length, file.getName());
//
//        sbText.append(IOUtil.readTxt(file.getPath()));
//
//        if (i == 100) break;
//
//    }
//
//    System.out.printf("对长度为%d的语料进行分析中...\n", sbText.length());
//
//    List<WordInfo> wordInfoList = discover.discover(xx, 50);
//    List<String> ss = HanLP.extractPhrase(xx, 15);
//
//
//    //打印出发现的新词
//    System.out.println(ss);
//    for (WordInfo wordInfo : wordInfoList) {
//
//     System.out.println(wordInfo.text);
//}
//	}
}
