package com.zzsn.search;


import com.zzsn.extrator.ExtType;
import com.zzsn.extrator.Processor;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.Serializable;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Tag-path based extraction implemented on top of Jsoup.
 *
 * <p>The extraction expression read from {@link ExtType#getExp()} takes one of two forms:
 * <ul>
 *   <li><b>Regex mode</b> - {@code REG[<regex>]} (case-insensitive prefix). The regex is applied
 *       to the parsed document's outer HTML and capture group 1 is returned.</li>
 *   <li><b>Tag-path mode</b> - one or more alternatives separated by {@code |}; the first
 *       alternative that resolves to a node wins. Each alternative is a list of steps separated
 *       by {@code .}. A step is {@code TAG}, optionally followed by a 1-based position
 *       {@code [n]} and/or attribute filters {@code [key=value]}. The step {@code *} is a
 *       wildcard: the step that follows it is searched for at any depth below the current node.
 *       An alternative starting with {@code *} is implicitly prefixed with {@code HTML[1].}.</li>
 * </ul>
 *
 * <p>In tag-path mode, {@link ExtType#getAttr()} (when not blank) selects an attribute of the
 * located node to return instead of its HTML, and {@link ExtType#getSubtraction()} is a
 * comma-separated list of tag expressions to remove from the located node before returning it.
 *
 * <p>Created by: Li Dongliang<br>
 * Created: 2015-5-17 3:07:01 PM<br>
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 *
 * @version 1.0
 */
public class JsoupTagProcessor implements Processor, Serializable {
    private static final Logger Log = LoggerFactory.getLogger(JsoupTagProcessor.class);

    private static final long serialVersionUID = 1L;

    /** Matches the whole {@code REG[...]} expression; group 1 is the embedded regex. */
    private static final Pattern REG_EXP_PATTERN = Pattern.compile("^(?i)REG\\[(.+)\\]$");
    /** Matches every bracketed section of a step, e.g. {@code [2]} or {@code [class=a]}. */
    private static final Pattern BRACKET_PATTERN = Pattern.compile("\\[.*?\\]");
    /** Matches a purely numeric bracketed section; group 1 is the position. */
    private static final Pattern POSITION_PATTERN = Pattern.compile("\\[(\\d+)\\]");
    /** Quote characters stripped from attribute values before comparison. */
    private static final Pattern QUOTE_PATTERN = Pattern.compile("'|\"|‘|“|”");
    /** Runs of whitespace (including the full-width space) collapsed to a single space. */
    private static final Pattern BLANK_RUN_PATTERN = Pattern.compile("[\\s|　]+");
    /** Step that means "search the following step at any depth". */
    private static final String WILDCARD = "*";

    /** Tag-path alternatives (already trimmed and prefixed); null until {@link #init()} runs or in regex mode. */
    private String[] nestTagsArray;
    /** Tag expressions to strip from the located node; null when nothing is configured. */
    private String[] removeTags;
    private final ExtType extType;
    /** Attribute to return instead of the node's HTML; null when not configured. */
    private String attr;

    /** Compiled regex for regex mode; null when the expression is not a usable regex. */
    private Pattern regPattern = null;
    private boolean isReg = false;

    public JsoupTagProcessor(ExtType extType){
        this.extType = extType;
    }

    /**
     * Normalises an attribute value for comparison: strips quote characters, trims, and
     * collapses whitespace runs into a single space.
     *
     * @param attr raw attribute value (from the expression or from the document)
     * @return the normalised value
     */
    private String formatAttr(String attr){
        String cleaned = QUOTE_PATTERN.matcher(attr).replaceAll("");
        cleaned = cleaned.trim();
        return BLANK_RUN_PATTERN.matcher(cleaned).replaceAll(" ");
    }

    /**
     * Compiles the regex embedded in a {@code REG[...]} expression.
     *
     * @param regExp the full expression, including the {@code REG[} prefix and {@code ]} suffix
     * @return the compiled pattern, or null when the expression is malformed, contains no
     *         parenthesised group, or is not a valid regular expression
     */
    private Pattern getRegPattern(String regExp){
        Matcher m = REG_EXP_PATTERN.matcher(regExp);
        if(!m.find()){
            return null;
        }
        String exp = m.group(1);
        // The extracted value is capture group 1, so a regex without parentheses is useless.
        if(!exp.matches(".*\\(.+\\).*")){
            return null;
        }
        try{
            return Pattern.compile(exp);
        }catch(java.util.regex.PatternSyntaxException e){
            Log.error(regExp+"中括号中的内容不是有效的正则表达式", e);
            return null;
        }
    }

    /**
     * Parses the configuration held by the {@link ExtType} into the internal lookup state.
     * All derived state is reset first, so calling this method more than once is safe.
     *
     * @return a fresh {@code DefaultMsg}
     */
    public DefaultMsg init() {
        DefaultMsg dm = new DefaultMsg();

        isReg = false;
        regPattern = null;
        nestTagsArray = null;
        removeTags = null;
        attr = null;

        String exp = extType.getExp();
        exp = (exp == null) ? "" : exp.trim();
        if(exp.toUpperCase(Locale.ROOT).startsWith("REG")){
            isReg = true;
            regPattern = this.getRegPattern(exp);
            return dm;
        }

        String[] alternatives = exp.split("\\|");
        for(int i = 0; i < alternatives.length; i++){
            String alternative = alternatives[i].trim();
            // A leading wildcard needs an anchor to start the search from.
            if(alternative.startsWith(WILDCARD)){
                alternative = "HTML[1]." + alternative;
            }
            alternatives[i] = alternative;
        }
        nestTagsArray = alternatives;

        String subtraction = extType.getSubtraction();
        if(subtraction != null && subtraction.length() > 0){
            List<String> tags = new ArrayList<String>();
            for(String tag : subtraction.split("\\,")){
                String trimmed = tag.trim();
                if(trimmed.length() > 0){
                    tags.add(trimmed);
                }
            }
            if(!tags.isEmpty()){
                removeTags = tags.toArray(new String[tags.size()]);
            }
        }

        // A blank attribute name means "not configured", same as for subtraction.
        String configuredAttr = extType.getAttr();
        if(configuredAttr != null && configuredAttr.trim().length() > 0){
            attr = configuredAttr.trim();
        }
        return dm;
    }

    public ExtType getExtType() {
        return this.extType;
    }

    /**
     * Parses the HTML, removes the configured subtraction tags from the whole document and
     * returns the resulting HTML.
     *
     * @param html    the HTML to process; null yields an empty string
     * @param charset unused; kept for interface compatibility. (The two-argument
     *                {@code Jsoup.parse(String, String)} takes a base URI, not a charset.)
     * @return the document's outer HTML after removal
     */
    public String test(String html,String charset) {
        if(html == null){
            return "";
        }
        Document document = Jsoup.parse(html);
        this.removeConfiguredTags(document);
        return document.outerHtml();
    }

    /**
     * Extracts content from the HTML according to the configured expression.
     *
     * @param html the HTML to extract from; null yields an empty string
     * @return in regex mode, capture group 1 of the first match; in tag-path mode, the configured
     *         attribute of the located node or its outer HTML after subtraction; an empty string
     *         when nothing matches or the processor has not been initialised
     */
    public String extract(String html) {
        if(html == null){
            return "";
        }
        Document doc = Jsoup.parse(html);
        // NOTE(review): removeAttr() takes an attribute NAME, so passing a CSS selector here
        // removes nothing. Presumably doc.select(...).remove() was intended. Left unchanged on
        // purpose: actually removing the element would shift sibling positions that existing
        // positional expressions may have been written against - confirm before changing.
        doc.removeAttr("div[class=\"weui-mask js_mask\"]");

        if(isReg){
            return this.extractByRegex(doc);
        }

        Node node = this.locate(doc);
        if(node == null){
            return "";
        }
        if(this.attr != null){
            return node.attr(attr);
        }
        this.removeConfiguredTags(node);
        return node.outerHtml();
    }

    /** Applies the configured regex to the document's outer HTML and returns group 1, or "". */
    private String extractByRegex(Document doc){
        if(regPattern == null){
            return "";
        }
        Matcher matcher = regPattern.matcher(doc.outerHtml());
        if(matcher.find() && matcher.groupCount() >= 1){
            String captured = matcher.group(1);
            // group(1) is null when the group did not take part in the match.
            return captured == null ? "" : captured;
        }
        return "";
    }

    /** Resolves the first tag-path alternative that yields a node; null when none does. */
    private Node locate(Document doc){
        if(nestTagsArray == null){
            return null;
        }
        for(String alternative : nestTagsArray){
            String[] steps = alternative.split("\\.");
            // e.g. "." splits into zero steps; there is nothing to resolve.
            if(steps.length == 0){
                continue;
            }
            Node found = this.resuChild(steps, doc, 0);
            if(found != null){
                return found;
            }
        }
        return null;
    }

    /** Removes every node matched by the configured subtraction expressions below {@code root}. */
    private void removeConfiguredTags(Node root){
        if(removeTags == null){
            return;
        }
        for(String removeTag : removeTags){
            for(Node unwanted : this.getRemoveTags(root, removeTag)){
                unwanted.remove();
            }
        }
    }

    /**
     * Finds the nodes below {@code node} that a subtraction expression refers to.
     *
     * @param node      the node to search below
     * @param removeTag a tag expression such as {@code DIV[class=ad]} or {@code SPAN[2]}
     * @return the matching nodes, possibly empty
     */
    private List<Node> getRemoveTags(Node node,String removeTag){
        return this.findNodes(node, removeTag, this.getAttrs(removeTag));
    }

    /**
     * Collects the {@code [key=value]} attribute filters of a step. Bracketed sections without
     * an {@code =} (such as a position) are ignored, as are filters with an empty key or value.
     * Values are normalised with the same rules used for document attribute values.
     *
     * @param tagsItem a single step, e.g. {@code DIV[2][class=content]}
     * @return a map of attribute name to normalised expected value, possibly empty
     */
    public Map<String,String> getAttrs(String tagsItem){
        Map<String,String> map = new HashMap<String,String>();
        Matcher m = BRACKET_PATTERN.matcher(tagsItem);
        while(m.find()){
            String inner = m.group().replace("[", "").replace("]", "");
            // Limit 2 so a value that itself contains '=' stays intact.
            String[] pair = inner.split("=", 2);
            if(pair.length != 2){
                continue;
            }
            String key = pair[0].trim();
            String value = this.formatAttr(pair[1]);
            if(key.length() > 0 && value.length() > 0){
                map.put(key, value);
            }
        }
        return map;
    }

    /**
     * Reads the first numeric position {@code [n]} of a step.
     *
     * @param tagsItem a single step, e.g. {@code DIV[2]}
     * @return the position, or null when the step has none
     * @throws NumberFormatException if the digits do not fit into an {@code int}
     */
    public Integer getPos(String tagsItem){
        Matcher m = POSITION_PATTERN.matcher(tagsItem);
        if(m.find()){
            return Integer.valueOf(m.group(1));
        }
        return null;
    }

    /**
     * Checks whether a node satisfies all attribute filters. An empty filter map always matches.
     * Document values are normalised before comparison, exactly like the expected values.
     *
     * @param tagNode the node to test
     * @param attrs   attribute name to normalised expected value
     * @return true when every filter is present on the node with an equal normalised value
     */
    private boolean match(Node tagNode,Map<String,String> attrs){
        for(Map.Entry<String,String> filter : attrs.entrySet()){
            String key = filter.getKey();
            if(!tagNode.hasAttr(key)){
                return false;
            }
            if(!filter.getValue().equals(this.formatAttr(tagNode.attr(key)))){
                return false;
            }
        }
        return true;
    }

    /**
     * Extracts the upper-cased tag name of a step, i.e. everything before the first {@code [}.
     *
     * @param tagsItem a single step
     * @return the tag name in upper case (locale independent)
     */
    private String getTagName(String tagsItem){
        int bracket = tagsItem.indexOf('[');
        String name = (bracket == -1) ? tagsItem : tagsItem.substring(0, bracket);
        return name.toUpperCase(Locale.ROOT);
    }

    /**
     * Selects the direct child elements of {@code node} with the given tag name that satisfy
     * the attribute filters.
     *
     * @param node    the parent; anything that is not an {@code Element} has no matches
     * @param tagName tag name to look for (compared case-insensitively)
     * @param attrs   attribute filters
     * @return the matching children, possibly empty, never null
     */
    private Elements selectElement(Node node,String tagName,Map<String,String> attrs){
        Elements elements = new Elements();
        if(!(node instanceof Element)){
            return elements;
        }
        String wanted = tagName.toUpperCase(Locale.ROOT);
        for(Element child : ((Element)node).children()){
            boolean sameTag = child.tagName().toUpperCase(Locale.ROOT).equals(wanted);
            if(sameTag && this.match(child, attrs)){
                elements.add(child);
            }
        }
        return elements;
    }

    /**
     * Finds the first group of sibling elements below {@code parent} that match. The direct
     * children are checked first; if none match, each child subtree is searched in document
     * order and the first non-empty group found is returned.
     *
     * @param parent  the node to search below
     * @param tagName tag name to look for
     * @param attrs   attribute filters
     * @return the matching sibling group, empty when nothing matches, never null
     */
    private Elements getDirectNodes(Node parent,String tagName,Map<String,String> attrs){
        Elements direct = this.selectElement(parent, tagName, attrs);
        if(!direct.isEmpty()){
            return direct;
        }
        for(Node child : parent.childNodes()){
            Elements nested = this.getDirectNodes(child, tagName, attrs);
            if(!nested.isEmpty()){
                return nested;
            }
        }
        return direct;
    }

    /**
     * Finds the nodes below {@code parent} described by a tag expression. Without a position
     * all nodes of the first matching sibling group are returned; with a position only that
     * one (1-based; 0 is treated as 1). A position beyond the group size matches nothing.
     *
     * @param parent the node to search below
     * @param tag    the tag expression, e.g. {@code DIV[2]}
     * @param attrs  attribute filters already parsed from {@code tag}
     * @return the matching nodes, possibly empty, never null
     */
    private List<Node> findNodes(Node parent,String tag,Map<String,String> attrs){
        List<Node> result = new ArrayList<Node>();
        Elements group = this.getDirectNodes(parent, this.getTagName(tag), attrs);
        Integer tagIndex = this.getPos(tag);
        if(tagIndex == null){
            result.addAll(group);
            return result;
        }
        if(!group.isEmpty() && group.size() >= tagIndex){
            result.add(group.get(Math.max(tagIndex - 1, 0)));
        }
        return result;
    }

    /**
     * Resolves a wildcard step: the step that follows the wildcard in the same path is searched
     * at any depth below {@code parent}, and resolution continues after it.
     *
     * @param nestTags     the steps of the current alternative
     * @param parent       the node to search below
     * @param nestTagindex index of the wildcard step
     * @return the resolved node, or null when the path cannot be resolved
     */
    private Node resuMultiple(String[] nestTags,Node parent,int nestTagindex){
        int endTagPosition = nestTagindex + 1;
        // A trailing wildcard has no target to search for.
        if(endTagPosition >= nestTags.length){
            return null;
        }
        String endTag = nestTags[endTagPosition];
        List<Node> finds = this.findNodes(parent, endTag, this.getAttrs(endTag));
        if(finds.isEmpty()){
            return null;
        }
        Node find = finds.get(0);
        int nextIndex = nestTagindex + 2;
        if(nextIndex >= nestTags.length){
            return find;
        }
        return this.resuChild(nestTags, find, nextIndex);
    }

    /**
     * Resolves an ordinary step against the direct children of {@code parent}. Children with
     * the step's tag name are counted; the first one that has the requested position (if any)
     * and satisfies the attribute filters is taken, and resolution continues below it. There is
     * no backtracking to later siblings if the rest of the path fails below that child.
     *
     * @param nestTags     the steps of the current alternative
     * @param parent       the node whose children are inspected
     * @param nestTagindex index of the current step
     * @param nestTag      the current step
     * @return the resolved node, or null when the path cannot be resolved
     */
    private Node resuSingle(String[] nestTags,Node parent,int nestTagindex,String nestTag){
        String tagName = this.getTagName(nestTag);
        Integer pos = this.getPos(nestTag);
        Map<String,String> attrs = this.getAttrs(nestTag);
        boolean lastStep = (nestTagindex == nestTags.length - 1);

        int count = 0;
        for(Node child : parent.childNodes()){
            if(!child.nodeName().toUpperCase(Locale.ROOT).equals(tagName)){
                continue;
            }
            count++;
            boolean positionOk = (pos == null) || (count == pos.intValue());
            if(positionOk && this.match(child, attrs)){
                return lastStep ? child : this.resuChild(nestTags, child, nestTagindex + 1);
            }
        }
        return null;
    }

    /**
     * Resolves the step at {@code nestTagindex} below {@code parent}, dispatching to wildcard
     * or ordinary resolution, and recursing until the path is exhausted.
     *
     * @param nestTags     the steps of the current alternative
     * @param parent       the node to resolve below; null yields null
     * @param nestTagindex index of the step to resolve
     * @return the resolved node, or null when the path cannot be resolved
     */
    private Node resuChild(String[] nestTags,Node parent,int nestTagindex) {
        if(parent == null){
            return null;
        }
        String nestTag = nestTags[nestTagindex];
        if(WILDCARD.equals(nestTag)){
            return this.resuMultiple(nestTags, parent, nestTagindex);
        }
        return this.resuSingle(nestTags, parent, nestTagindex, nestTag);
    }

    /** Intentionally empty entry point, kept for backward compatibility. */
    public static void main(String[] args) throws FileNotFoundException  {
    }

}
