package com.zzsn.generation.segment;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Named-entity recognition utility for English text. Text is tagged by a Stanford NER
 * CRF classifier and the resulting inline markup is parsed with jsoup to collect the
 * tagged entities.
 *
 * Author: Li Dongliang. Created: 2015-06-15 18:04:38.
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 *
 * @version 1.0
 */
public class EnSegmentUtil {

    /** Location of the serialized CRF model passed to {@link CRFClassifier#getClassifierNoExceptions(String)}. */
    public static final String serializedClassifier = "data/classifiers/english.all.3class.distsim.crf.ser.gz";

    /** Classifier instance, created once when this class is initialized and reused by every call. */
    public static final AbstractSequenceClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);

    /**
     * Recognizes person, location and organization entities in the given text.
     *
     * @param content text to analyze; may be {@code null}
     * @return a map holding three lists keyed by {@code SegmentUtil.ENTITY_NAME} (persons),
     *         {@code SegmentUtil.ENTITY_PLACE} (locations) and {@code SegmentUtil.ENTITY_ORG}
     *         (organizations); an empty map without those keys when {@code content} is
     *         {@code null} or contains only whitespace
     */
    public static HashMap<String, List<String>> entityRecognize(String content) {
        HashMap<String, List<String>> resultMap = new HashMap<String, List<String>>();
        if (null == content || content.trim().length() == 0) {
            return resultMap;
        }

        // The classifier wraps each recognized entity in an inline tag named after its
        // class; parsing that markup lets the entities be looked up by tag name.
        Document doc = Jsoup.parse(classifier.classifyWithInlineXML(content));

        resultMap.put(SegmentUtil.ENTITY_NAME, textsOfTag(doc, "PERSON"));
        resultMap.put(SegmentUtil.ENTITY_PLACE, textsOfTag(doc, "LOCATION"));
        resultMap.put(SegmentUtil.ENTITY_ORG, textsOfTag(doc, "ORGANIZATION"));
        return resultMap;
    }

    /**
     * Collects the text of every element with the given tag name, in the order
     * returned by {@code getElementsByTag}.
     */
    private static List<String> textsOfTag(Document doc, String tag) {
        Elements nodes = doc.getElementsByTag(tag);
        List<String> texts = new ArrayList<String>(nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            texts.add(nodes.get(i).text());
        }
        return texts;
    }

    /**
     * Manual smoke test: prints the inline-tagged form of a sample sentence, followed by
     * each recognized location on its own line.
     *
     * @param args unused
     * @throws IOException declared for compatibility with existing callers; not thrown by this body
     */
    public static void main(String[] args) throws IOException {
        // Sample input mixing English place names with a few non-English tokens.
        String s1 = "十日泰国游  America  Oregon Peleliu Mysterious bright blue light illuminates New York skyline after explosion Kayangel,中国 Lucy The next day is Friday Pedro Juan Caballero,Jeonnam,Río Negro,Busan,Daejeon,Villarrica,same as Paraguay Beijing  HongKong f California f Massachusetts  f Michigan  f Jefferson City fff Santa Fe, Salt Lake City";

        // Reuse the shared static classifier: it is already loaded by the time main runs,
        // so building a second instance here would only deserialize the model twice.
        String rr = classifier.classifyWithInlineXML(s1);
        System.out.println(rr);

        Document doc = Jsoup.parse(rr);
        Elements nodes = doc.getElementsByTag("LOCATION");
        for (int i = 0; i < nodes.size(); i++) {
            System.out.println(nodes.get(i).text());
        }
    }
}
