package com.zzsn.event.util;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.document.sentence.Sentence;
import com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.SentencesUtil;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;


/**
 * Static helpers that extract named entities (organisations, person names and
 * place names) from text using HanLP.
 *
 * <p>Two HanLP entry points are used: a {@link PerceptronLexicalAnalyzer} built
 * from the model paths in {@code HanLP.Config}, and the default
 * {@code HanLP.segment} segmenter. Every extraction method returns a new,
 * mutable list of trimmed, non-empty, de-duplicated strings in order of first
 * appearance, and returns an empty list for {@code null} input.
 */
public class CompanyUtil {

	/** Word labels collected by {@link #entityRecognize(String)} (organisations). */
	private static final Set<String> ORG_LABELS =
			labelSet("nt", "ntc", "ntcf", "nto", "ntch", "nth");

	/** Labels collected by the person-name methods. */
	private static final Set<String> PERSON_LABELS =
			labelSet("nr", "nrj", "nr2", "nrf");

	/** Labels collected by {@link #entityAdrssRecognize(String)} (place names). */
	private static final Set<String> PLACE_LABELS =
			labelSet("ns", "nsf");

	/** Perceptron analyzer, or {@code null} when its models could not be loaded. */
	private static final PerceptronLexicalAnalyzer analyzer;

	/** Failure raised while loading the analyzer models, or {@code null} on success. */
	private static final IOException analyzerLoadError;

	static {
		PerceptronLexicalAnalyzer loaded = null;
		IOException failure = null;
		try {
			loaded = new PerceptronLexicalAnalyzer(HanLP.Config.PerceptronCWSModelPath,
					HanLP.Config.PerceptronPOSModelPath,
					HanLP.Config.PerceptronNERModelPath);
		} catch (IOException e) {
			// Do not abort class loading: the HanLP.segment based methods do not
			// need this analyzer. The cause is kept so that callers which do need
			// it get a meaningful exception instead of a bare NullPointerException.
			failure = e;
			e.printStackTrace();
		}
		analyzer = loaded;
		analyzerLoadError = failure;
	}

	/**
	 * Extracts organisation names from the text with the perceptron analyzer.
	 *
	 * @param content text to analyse; {@code null} yields an empty list
	 * @return distinct trimmed values of the words labelled
	 *         {@code nt}, {@code ntc}, {@code ntcf}, {@code nto}, {@code ntch} or {@code nth}
	 * @throws IllegalStateException if there is text to analyse but the
	 *         perceptron models failed to load
	 */
	public static List<String> entityRecognize(String content) {
		return collectWithAnalyzer(content, ORG_LABELS);
	}

	/**
	 * Extracts person names from the text with the perceptron analyzer.
	 *
	 * @param content text to analyse; {@code null} yields an empty list
	 * @return distinct trimmed values of the words labelled
	 *         {@code nr}, {@code nrj}, {@code nr2} or {@code nrf}
	 * @throws IllegalStateException if there is text to analyse but the
	 *         perceptron models failed to load
	 */
	public static List<String> entityPersonRecognize(String content) {
		return collectWithAnalyzer(content, PERSON_LABELS);
	}

	/**
	 * Extracts person names from the text with the default {@code HanLP.segment}
	 * segmenter. Every '丨' character is replaced by a space before segmentation.
	 *
	 * @param content text to analyse; {@code null} yields an empty list
	 * @return distinct trimmed terms whose nature is
	 *         {@code nr}, {@code nrj}, {@code nr2} or {@code nrf}
	 */
	public static List<String> entityPersonRecognizeOld(String content) {
		return collectWithSegmenter(content, PERSON_LABELS);
	}

	/**
	 * Extracts place names from the text with the default {@code HanLP.segment}
	 * segmenter. Every '丨' character is replaced by a space before segmentation.
	 *
	 * @param content text to analyse; {@code null} yields an empty list
	 * @return distinct trimmed terms whose nature is {@code ns} or {@code nsf}
	 */
	public static List<String> entityAdrssRecognize(String content) {
		return collectWithSegmenter(content, PLACE_LABELS);
	}

	/**
	 * Returns the place names followed by the person names found in the text,
	 * both obtained through the {@code HanLP.segment} based methods.
	 *
	 * <p>Organisation names are not included. The two partial results are simply
	 * concatenated, so a string found in both of them appears twice.
	 *
	 * @param content text to analyse; {@code null} yields an empty list
	 * @return a new list: place names first, then person names
	 */
	public static List<String> entityAll(String content) {
		List<String> result = new ArrayList<String>();
		result.addAll(entityAdrssRecognize(content));
		result.addAll(entityPersonRecognizeOld(content));
		return result;
	}

	/**
	 * Manual smoke test: prints the organisations recognised in a sample sentence.
	 *
	 * @param args ignored
	 */
	public static void main(String args[]) {
		String sample = "三星从越南撤离返回中国";
		for (String company : CompanyUtil.entityRecognize(sample)) {
			System.out.println(company);
		}
	}

	/** Builds an immutable lookup set from the given labels. */
	private static Set<String> labelSet(String... labels) {
		return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(labels)));
	}

	/** Returns the perceptron analyzer, or fails with the recorded load error as cause. */
	private static PerceptronLexicalAnalyzer requireAnalyzer() {
		if (analyzer == null) {
			throw new IllegalStateException(
					"HanLP perceptron lexical analyzer is unavailable: its models failed to load",
					analyzerLoadError);
		}
		return analyzer;
	}

	/** Collects the values of all words whose label is in {@code labels}, using the perceptron analyzer. */
	private static List<String> collectWithAnalyzer(String content, Set<String> labels) {
		Set<String> found = new LinkedHashSet<String>();
		if (content != null) {
			for (String sentence : SentencesUtil.toSentenceList(content)) {
				// The analyzer is only required once there is a sentence to analyse.
				Sentence words = requireAnalyzer().analyze(sentence);
				for (IWord word : words) {
					// Compound and simple words expose label and value through
					// IWord, so no type distinction is needed. A null label is
					// simply not a member of the set.
					if (labels.contains(word.getLabel())) {
						addCleaned(found, word.getValue());
					}
				}
			}
		}
		return new ArrayList<String>(found);
	}

	/** Collects all terms whose nature name is in {@code natures}, using {@code HanLP.segment}. */
	private static List<String> collectWithSegmenter(String content, Set<String> natures) {
		Set<String> found = new LinkedHashSet<String>();
		if (content != null) {
			String normalized = content.replace('丨', ' ');
			for (String sentence : SentencesUtil.toSentenceList(normalized)) {
				for (Term term : HanLP.segment(sentence)) {
					if (term.nature != null && natures.contains(term.nature.toString())) {
						addCleaned(found, term.word);
					}
				}
			}
		}
		return new ArrayList<String>(found);
	}

	/** Adds the trimmed value to {@code target} unless it is null or blank after trimming. */
	private static void addCleaned(Set<String> target, String value) {
		if (value == null) {
			return;
		}
		String trimmed = value.trim();
		if (!trimmed.isEmpty()) {
			target.add(trimmed);
		}
	}
}
