package com.zzsn.extrator;

import java.io.InputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


import com.zzsn.util.CharsetUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import com.zzsn.util.FileUtil;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
 * Article extraction handler. Runs the configured {@link Processor}s over a page's HTML and
 * fills an {@link ExtEntity} with title, author, publish date, origin, keywords, summary and
 * tagged body content. It also exposes the HTML and charset it worked on.
 *
 * <p>An instance is created either around an HTTP response (the HTML is then loaded with
 * {@link #readEntity(String, InputStream)}) or directly around an HTML string.
 *
 * <p>Author: Li Dongliang<br>
 * Created: 2015-05-11 15:28:12<br>
 * Company: Zhengzhou Shuneng Software Technology Co., Ltd.
 *
 * @version 1.0
 */
public class WebExtractorImplforweixin implements Extractor {
    private static final Logger Log = LoggerFactory.getLogger(WebExtractorImplforweixin.class);

    /**
     * Matches a {@code <...>} token that contains at least one run of characters outside the
     * set accepted by the surrounding character classes. Compiled once; {@link Pattern} is
     * immutable and safe to share.
     */
    private static final Pattern DIRTY_TAG =
            Pattern.compile("<[\\d|\\w|\\/]*([^(\\d|\\w|\\/)]+)[\\d|\\w|\\/]*>");

    /**
     * Matches runs of characters that are none of: letters, digits, underscore, {@code /},
     * {@code |}, {@code (}, {@code )}, {@code <}, {@code >}. These are stripped from a tag.
     */
    private static final Pattern INVALID_TAG_CHARS =
            Pattern.compile("[^(\\<|\\>|\\d|\\w|\\/)]+");

    /**
     * Marker preceding the embedded timestamp used as publish-date fallback.
     * NOTE(review): presumably the tail of a {@code var ct = "<epoch seconds>"} script
     * assignment in the page; confirm against real pages.
     */
    private static final String CT_MARKER = "ar ct = \"";

    /** Number of characters read after {@link #CT_MARKER} (original code read exactly 10). */
    private static final int CT_VALUE_LENGTH = 10;

    /** Kinds of field a processor can extract. */
    public enum EXT_TYPE {
        CONTENT, TITLE, KEYWORDS, SUMMARY, AUTHOR, PUBLISH_DATE, ORIGIN
    }

    private List<Processor> processors;
    private HttpResponse getMethod;
    private String html;
    private String charset;

    /**
     * Creates an extractor whose HTML will be loaded later via
     * {@link #readEntity(String, InputStream)}.
     *
     * @param processors extraction rules to apply in {@link #process(ExtEntity)}
     * @param getMethod  HTTP response whose {@code Content-Type} header is consulted for the charset
     */
    public WebExtractorImplforweixin(List<Processor> processors, HttpResponse getMethod) {
        this.processors = processors;
        this.getMethod = getMethod;
    }

    /**
     * Creates an extractor that works directly on the given HTML.
     *
     * @param processors extraction rules to apply in {@link #process(ExtEntity)}
     * @param html       page markup to extract from
     */
    public WebExtractorImplforweixin(List<Processor> processors, String html) {
        this.processors = processors;
        this.html = html;
    }

    /**
     * Reads the page from the stream, detects its charset, re-decodes the text with that
     * charset and normalizes the markup through jsoup. The results are stored in this
     * instance and are available through {@link #getContent()} and {@link #getCharset()}.
     *
     * <p>Failures are logged and reported through the return value; no exception escapes.
     *
     * @param url         address of the page, used only for diagnostics
     * @param inputStream stream holding the raw page content
     * @return {@code true} if HTML was read and normalized, {@code false} on any failure
     */
    public boolean readEntity(String url, InputStream inputStream) {
        try {
            html = FileUtil.readHtml(inputStream, Constants.READ_CHARSET);
            // The response is absent when this instance was built from an HTML string; a null
            // header is what getFirstHeader yields anyway when Content-Type is missing.
            Header contentType = (getMethod == null) ? null : getMethod.getFirstHeader("Content-Type");
            charset = CharsetUtil.getCharset(html, contentType);
            html = CharsetUtil.convertCorrectCharset(html, charset);

            Document jsoupDoc = Jsoup.parse(html);
            html = jsoupDoc.html();
        } catch (Exception e) {
            // Best effort by contract, but leave a trace instead of swallowing silently.
            Log.warn("Failed to read or normalize page content, url=" + url, e);
            return false;
        }
        return html != null;
    }

    /**
     * Returns the body content with tags kept but useless tags removed.
     *
     * @param body raw extracted body markup
     * @return cleaned markup as produced by {@code ContentUtility.RemoveUselessHTMLTagX}
     */
    private String getContentWithTag(String body) {
        return ContentUtility.RemoveUselessHTMLTagX(body);
    }

    /**
     * Returns the plain-text form of a markup fragment.
     *
     * @param contentWithTag markup fragment
     * @return text as produced by {@code ContentUtility.TransferHTML2Text}
     */
    private String getContentNoTag(String contentWithTag) {
        return ContentUtility.TransferHTML2Text(contentWithTag);
    }

    /**
     * Removes invalid characters from inside tag-like tokens of the given markup.
     *
     * <p>Every token matched by {@link #DIRTY_TAG} is replaced, wherever it occurs, by the
     * same token with the characters matched by {@link #INVALID_TAG_CHARS} removed.
     *
     * @param html markup to clean; may be {@code null}
     * @return cleaned markup, or {@code null} if {@code html} was {@code null}
     */
    public String formatHtmlTag(String html) {
        if (html == null) {
            return null;
        }
        // The matcher keeps scanning the original string; only the local copy is rewritten.
        Matcher tags = DIRTY_TAG.matcher(html);
        while (tags.find()) {
            String tag = tags.group();
            String cleaned = INVALID_TAG_CHARS.matcher(tag).replaceAll("");
            // Literal replace: the tag text may contain regex metacharacters, so it must not
            // be interpreted as a pattern.
            html = html.replace(tag, cleaned);
        }
        return html;
    }

    /**
     * Applies every configured processor to the current HTML and stores each result in the
     * entity. A field is only filled while it is still {@code null}, so the first processor
     * producing a value for a field wins. Charset handling is not performed here.
     *
     * @param entity target that receives the extracted fields
     * @throws Exception if a processor or a helper utility fails
     */
    public void process(ExtEntity entity) throws Exception {
        for (Processor processor : processors) {
            // Locale.ROOT keeps the comparison independent of the JVM default locale.
            String ename = processor.getExtType().getEname().toUpperCase(Locale.ROOT);

            if (ename.equals(EXT_TYPE.TITLE.toString()) && entity.getTitle() == null) {
                String text = extractPlainText(processor);
                if (hasText(text)) {
                    entity.setTitle(text);
                }
            } else if (ename.equals(EXT_TYPE.AUTHOR.toString()) && entity.getAuthor() == null) {
                String text = extractPlainText(processor);
                if (hasText(text)) {
                    entity.setAuthor(text);
                }
            } else if (ename.equals(EXT_TYPE.PUBLISH_DATE.toString()) && entity.getPublishDate() == null) {
                String raw = processor.extract(html);
                if (raw != null) {
                    String text = getContentNoTag(raw);
                    if (hasText(text)) {
                        entity.setPublishDate(text);
                    } else {
                        // The rule matched but yielded no text: fall back to the timestamp
                        // embedded in the page, if there is one.
                        applyEmbeddedPublishDate(entity);
                    }
                }
            } else if (ename.equals(EXT_TYPE.ORIGIN.toString()) && entity.getOrigin() == null) {
                String text = extractPlainText(processor);
                if (hasText(text)) {
                    entity.setOrigin(text);
                }
            } else if (ename.equals(EXT_TYPE.KEYWORDS.toString()) && entity.getKeywords() == null) {
                String text = extractPlainText(processor);
                if (hasText(text)) {
                    entity.setKeywords(text);
                }
            } else if (ename.equals(EXT_TYPE.SUMMARY.toString()) && entity.getSummary() == null) {
                String text = extractPlainText(processor);
                if (hasText(text)) {
                    entity.setSummary(text);
                }
            } else if (ename.equals(EXT_TYPE.CONTENT.toString()) && entity.getContentWithTag() == null) {
                String raw = processor.extract(html);
                if (raw != null) {
                    // Body content keeps its tags; only useless tags are removed.
                    entity.setContentWithTag(getContentWithTag(raw));
                }
            }
        }
    }

    /** Runs the processor on the current HTML and returns the tag-free text, or null if nothing matched. */
    private String extractPlainText(Processor processor) throws Exception {
        String raw = processor.extract(html);
        return (raw == null) ? null : getContentNoTag(raw);
    }

    /** Tells whether the value is non-null and non-empty. */
    private static boolean hasText(String value) {
        return value != null && value.length() > 0;
    }

    /**
     * Sets the publish date from the value following {@link #CT_MARKER} in the HTML, when the
     * marker is present and the full value fits inside the page; otherwise does nothing.
     */
    private void applyEmbeddedPublishDate(ExtEntity entity) throws Exception {
        int marker = html.indexOf(CT_MARKER);
        if (marker < 0) {
            return;
        }
        int start = marker + CT_MARKER.length();
        int end = start + CT_VALUE_LENGTH;
        if (end > html.length()) {
            // Page ends inside the value; nothing usable to read.
            return;
        }
        String seconds = html.substring(start, end);
        // NOTE(review): "000" is appended presumably to turn epoch seconds into
        // milliseconds for DateUtil; confirm against DateUtil.tiemString2String.
        String time = DateUtil.tiemString2String(seconds + "000", true);
        entity.setPublishDate(time);
    }

    /**
     * @return the HTML this extractor currently holds
     */
    public String getContent() {
        return html;
    }

    /**
     * @return the charset detected by {@link #readEntity(String, InputStream)}, or
     *         {@code null} if it has not been determined
     */
    public String getCharset() {
        return charset;
    }
}
