package com.zzsn.search.extractor;

import com.zzsn.docinfo.DocInfo;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utility.util.Utility;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.*;
import java.util.regex.Matcher;


/**
 * Standard implementation of web-page main-content extraction.
 * Author: 冯胜利
 * Algorithm:
 *
 * Created: 2015-06-08 15:07:01
 * Company: 郑州数能软件科技有限公司
 * @version  1.0
 *
 */
public class StandardWebExtractorHandler {
	private static final Logger Log = LoggerFactory.getLogger(StandardWebExtractorHandler.class);

//	private String langType = "none";	//语言类型
//	private int thresholdLinkTextLen = 500; //
	//private int thresholdNormalTextRatio = 10;
	
	
	/**
	 * Computes a rough character-overlap score between two strings.
	 * <p>
	 * Two counts are taken: the number of positions in {@code str1} whose character
	 * occurs anywhere in {@code str2}, and the number of positions in {@code str2}
	 * whose character occurs anywhere in {@code str1}. The smaller count is returned.
	 * A repeated character is counted once for every position it occupies.
	 *
	 * @param str1 first string, may be {@code null}
	 * @param str2 second string, may be {@code null}
	 * @return the smaller of the two overlap counts; {@code 0} if either argument is
	 *         {@code null} or empty
	 */
	public static int ldforcn(String str1, String str2) {
		if (str1 == null || str2 == null) {
			return 0;
		}
		if (str1.isEmpty() || str2.isEmpty()) {
			return 0;
		}
		int firstInSecond = countCharsPresentIn(str1, str2);
		int secondInFirst = countCharsPresentIn(str2, str1);
		return Math.min(firstInSecond, secondInFirst);
	}

	/** Counts the positions of {@code probe} whose character also appears somewhere in {@code haystack}. */
	private static int countCharsPresentIn(String probe, String haystack) {
		int hits = 0;
		for (char c : probe.toCharArray()) {
			if (haystack.indexOf(c) >= 0) {
				hits++;
			}
		}
		return hits;
	}
	
	/**
	 * Recursively searches the subtree under {@code root} for the first leaf element
	 * (an element without child elements) whose text is similar to {@code oldtitle}.
	 * <p>
	 * Similarity is {@code ldforcn(oldtitle, leafText)} divided by the length of the
	 * longer of the two strings; a leaf matches when that ratio is greater than 0.6.
	 * The matching text is appended to {@code resultMap}, and the search stops as soon
	 * as the list is non-empty.
	 *
	 * @param root      element to search; {@code null} is ignored
	 * @param oldtitle  title text to compare against; {@code null} yields no match
	 * @param resultMap output list (despite the name) that receives the matching leaf text
	 * @return always the empty string; the result is delivered through {@code resultMap}
	 */
	private static String findtitle(Element root,String oldtitle,List<String> resultMap)
	{
		try
		{
			// Nothing to do: a match was already recorded, or there is nothing to compare.
			if(!resultMap.isEmpty() || root==null || oldtitle==null)
			{
				return "";
			}

			Elements children = root.children();
			if(!children.isEmpty())
			{
				// Only leaves are compared; inner nodes just forward the search.
				for(Element childElem : children)
				{
					findtitle(childElem,oldtitle,resultMap);
					if(!resultMap.isEmpty())
					{
						break;
					}
				}
				return "";
			}

			String leafText = root.text();
			int shared = ldforcn(oldtitle, leafText);
			// When both strings are empty this is 0.0/0 = NaN, which fails the comparison below.
			double similarity = (double) shared / Math.max(oldtitle.length(), leafText.length());
			if(similarity > 0.6)
			{
				resultMap.add(leafText);
			}
		}
		catch (Exception e)
		{
			Log.error("Failed to search for title in HTML", e);
		}
		return "";
	}
	/**
	 * Looks inside an HTML page for text that resembles a given title.
	 * <p>
	 * The HTML is passed through {@code Utility.RemoveStyleCode}, parsed with jsoup, and
	 * the first {@code body} element is searched with {@link #findtitle}.
	 *
	 * @param htmlcontent raw HTML of the page
	 * @param oldtitle    title text to look for
	 * @return the first matching text found, or the empty string when nothing matches
	 *         or the parsed document has no {@code body} element
	 */
	public static String getTitle(String htmlcontent,String oldtitle) {
		Document parsed = Jsoup.parse(Utility.RemoveStyleCode(htmlcontent));
		Elements bodies = parsed.getElementsByTag("body");
		if (bodies.isEmpty()) {
			return "";
		}
		List<String> matches = new ArrayList<String>();
		findtitle(bodies.get(0), oldtitle, matches);
		return matches.isEmpty() ? "" : matches.get(0);
	}
	
	
	/**
	 * Extracts the main content of an HTML page and stores it on {@code docInfo}.
	 * <p>
	 * Steps:
	 * <ol>
	 *   <li>If the current title contains {@code "..."}, the dots are stripped and the
	 *       page is searched for similar text via {@link #getTitle}; a non-empty result
	 *       replaces the title. Failures in this step are logged and otherwise ignored.</li>
	 *   <li>The page is run through {@code standardExtractor}. A {@code null} or blank
	 *       result marks the returned message as failed.</li>
	 *   <li>Otherwise the tagged and plain-text content are stored on {@code docInfo}
	 *       and refined by {@code buildProcessItem}.</li>
	 * </ol>
	 *
	 * @param htmlContent raw HTML of the page
	 * @param docInfo     document record to update in place; must not be {@code null}
	 * @return a new {@code DefaultMsg}; on extraction failure its message is set and
	 *         success is set to 0, otherwise it is returned as constructed
	 */
	public DefaultMsg doHandler(String htmlContent,DocInfo docInfo){
		DefaultMsg dm = new DefaultMsg();

		// A title containing "..." is treated as truncated: try to recover the full text from the page.
		String oldtitle = docInfo.getTitle();
		if (oldtitle != null && oldtitle.contains("...")) {
			try {
				String newtitle = getTitle(htmlContent, oldtitle.replace("...", ""));
				if (StringUtils.isNotEmpty(newtitle)) {
					docInfo.setTitle(newtitle);
					Log.info("new---------{}", newtitle);
				}
			} catch (Exception e) {
				// Best effort only: keep the original title and continue with extraction.
				Log.warn("Failed to recover truncated title", e);
			}
		}

		String contentWithTag = this.standardExtractor(htmlContent);
		if (contentWithTag == null || contentWithTag.trim().length() == 0) {
			dm.setMsg("解析html文本失败");
			dm.setSuccess(0);
			return dm;
		}

		docInfo.setContentWithTag(contentWithTag);
		docInfo.setContentNoTag(Utility.TransferHTML2Text(contentWithTag));
		this.buildProcessItem(docInfo);
		return dm;
	}
	
	/**
	 * Extracts the main-content region of an HTML page and returns it wrapped in a
	 * minimal UTF-8 HTML document.
	 * <p>
	 * Steps:
	 * <ol>
	 *   <li>Choose a candidate element (see {@link #pickMainCandidate}).</li>
	 *   <li>Re-parse the candidate after {@code Utility.RemoveStyleCode}, then drop
	 *       elements with {@code type=hidden} and elements whose inline style contains
	 *       {@code display:none}.</li>
	 *   <li>Prune the result with {@code RemoveInvalidElement}, then remove elements
	 *       whose class/child-count signature was reported as repeated.</li>
	 * </ol>
	 *
	 * @param htmlContent raw HTML of the page
	 * @return the wrapped content; the empty string when {@code htmlContent} is
	 *         {@code null}/blank or an exception occurs
	 */
	private String standardExtractor(String htmlContent) {
		if (htmlContent == null || htmlContent.trim().isEmpty()) {
			return "";
		}
		try {
			Document doc = Jsoup.parse(htmlContent);
			// Ratio handed to RemoveInvalidElement. 18 is the value previously selected
			// for the hard-coded "cn" language type.
			final int thresholdNormalTextRatio = 18;

			Element candElem = pickMainCandidate(doc);

			StringBuilder sb = new StringBuilder();
			sb.append("<html><head>");
			sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
			sb.append("<title></title></head><body>");
			// NOTE(review): when no candidate is found the wrapper is still returned with an
			// empty body, so callers that test the result for blankness will not see a failure.
			if (candElem != null) {
				Document docNew = Jsoup.parse(Utility.RemoveStyleCode(candElem.outerHtml()));

				// Drop content that is not visible to a reader.
				for (Element hidden : docNew.select("[type=hidden]")) {
					hidden.remove();
				}
				// Regex match so that "display:none", "display: none;" and styles carrying
				// additional declarations are all recognised.
				for (Element hidden : docNew.select("[style~=(?i)display\\s*:\\s*none]")) {
					hidden.remove();
				}

				Element cleaned = docNew.body();
				Map<String, String> countMap = new HashMap<String, String>();
				Map<String, String> resultMap = new HashMap<String, String>();
				RemoveInvalidElement(cleaned, countMap, resultMap, thresholdNormalTextRatio);
				removeRepeatedClassElements(cleaned, resultMap);
				sb.append(cleaned.outerHtml());
			}
			sb.append("</body></html>");
			return sb.toString();
		} catch (Exception ex) {
			Log.error("Failed to extract main content from HTML", ex);
		}
		return "";
	}

	/**
	 * Picks the element most likely to hold the main content.
	 * <p>
	 * If the document contains neither {@code table} nor {@code div}, the whole document
	 * is the candidate. Otherwise each direct child of {@code body} that contains (or is)
	 * a {@code table}/{@code div} is measured by the length of its text after
	 * {@code Utility.RemoveAllLink}; children below 100 characters are ignored and the
	 * first child with the largest length wins.
	 *
	 * @return the chosen element, or {@code null} when no child qualifies
	 */
	private static Element pickMainCandidate(Document doc) {
		if (doc.select("table").isEmpty() && doc.select("div").isEmpty()) {
			return doc;
		}
		Element best = null;
		int bestNormalTextLen = 0;
		for (Element childElem : doc.body().children()) {
			if (childElem.select("table").isEmpty() && childElem.select("div").isEmpty()) {
				continue;
			}
			String withoutLinks = Utility.RemoveAllLink(childElem.html());
			int normalTextLen = Utility.TransferHTML2Text(withoutLinks).length();
			if (normalTextLen < 100) {
				continue;
			}
			// Strict '>' keeps the earliest element on ties.
			if (best == null || normalTextLen > bestNormalTextLen) {
				best = childElem;
				bestNormalTextLen = normalTextLen;
			}
		}
		return best;
	}

	/**
	 * Removes elements under {@code scope} that match a repeated-structure signature.
	 * <p>
	 * Each key of {@code repeated} has the form {@code className~childElementCount} and
	 * each value the form {@code childNodeCount~firstChildChildCount~siblingCount}, as
	 * produced by {@code RemoveInvalidElement}. An element is removed when its
	 * {@code class} attribute matches and all four numbers agree.
	 */
	private static void removeRepeatedClassElements(Element scope, Map<String, String> repeated) {
		for (Map.Entry<String, String> entry : repeated.entrySet()) {
			String classKey = entry.getKey();
			int sep = classKey.lastIndexOf("~");
			String className = classKey.substring(0, sep);
			if (className.isEmpty()) {
				continue;
			}
			int expectedChildren = Integer.parseInt(classKey.substring(sep + 1));
			String signature = entry.getValue();

			for (Element element : scope.getElementsByAttributeValue("class", className)) {
				Element parent = element.parent();
				if (parent == null) {
					continue;
				}
				Elements kids = element.children();
				int grandChildren = kids.isEmpty() ? 0 : kids.get(0).children().size();
				String current = element.childNodeSize() + "~"
						+ grandChildren + "~" + parent.children().size();
				if (current.equals(signature) && kids.size() == expectedChildren) {
					element.remove();
				}
			}
		}
	}
	
	/**
	 * Pre-cleans a whole HTML page: prunes it with {@code RemoveInvalidElement}
	 * (ratio threshold 18), removes elements whose class/child-count signature was
	 * reported as repeated, and returns the cleaned document wrapped in a minimal
	 * UTF-8 HTML shell.
	 *
	 * @param htmlContent raw HTML of the page
	 * @return the cleaned, wrapped HTML; the unmodified {@code htmlContent} if an
	 *         exception occurs
	 */
	public  String RemoveInvalidElementbeforedohandle(String htmlContent) {
		try {
			Map<String, String> countMap = new HashMap<String, String>();
			Map<String, String> resultMap = new HashMap<String, String>();
			Document doc = Jsoup.parse(Utility.RemoveStyleCode(htmlContent));
			RemoveInvalidElement(doc, countMap, resultMap, 18);

			// Keys are "className~childElementCount"; values are
			// "childNodeCount~firstChildChildCount~siblingCount".
			for (Map.Entry<String, String> entry : resultMap.entrySet()) {
				String classKey = entry.getKey();
				int lastIndex = classKey.lastIndexOf("~");
				String className = classKey.substring(0, lastIndex);
				if (className.isEmpty()) {
					continue;
				}
				int expectedChildren = Integer.parseInt(classKey.substring(lastIndex + 1));
				String classValue = entry.getValue();

				for (Element element : doc.getElementsByAttributeValue("class", className)) {
					Element parent = element.parent();
					if (parent == null) {
						continue;
					}
					Elements kids = element.children();
					int c_childrenSize = kids.isEmpty() ? 0 : kids.get(0).children().size();
					String currentValue = element.childNodeSize() + "~"
							+ c_childrenSize + "~" + parent.children().size();
					if (currentValue.equals(classValue) && kids.size() == expectedChildren) {
						element.remove();
					}
				}
			}

			StringBuilder sb = new StringBuilder();
			sb.append("<html><head>");
			sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
			sb.append("<title></title></head><body>");
			// Emit the cleaned document exactly once, after all removals are done.
			// NOTE(review): doc.outerHtml() is itself a full <html> document, so it ends up
			// nested inside the wrapper body; confirm whether doc.body().html() was intended.
			sb.append(doc.outerHtml());
			sb.append("</body></html>");
			return sb.toString();
		} catch (Exception e) {
			Log.error("Failed to pre-clean HTML", e);
		}
		return htmlContent;
	}
	
	
	/**
	 * Recursively removes page blocks that look like boilerplate rather than content.
	 * <p>
	 * Children are processed first. The element itself is then judged only if it is a
	 * {@code table}/{@code div}, or has child elements and is not an {@code a} tag. It is
	 * removed from the DOM when any of the length-ratio, keyword or word-content
	 * heuristics below fires. Elements that survive are recorded in {@code countMap}
	 * under the key {@code class~childElementCount} with the value
	 * {@code childNodeCount~firstChildChildCount~siblingCount}; when a later element
	 * with a non-empty class produces the same key and value, the pair is copied to
	 * {@code resultMap} so the caller can remove the repeated structure.
	 *
	 * @param root                     DOM node to process; {@code null} returns {@code false}
	 * @param countMap                 working map of structure signatures seen so far
	 * @param resultMap                output map of signatures that occurred more than once
	 * @param thresholdNormalTextRatio markup-to-text ratio at or above which an element
	 *                                 with more than 120 characters of text is removed
	 * @return {@code false} if {@code root} is {@code null} or an exception occurred,
	 *         {@code true} otherwise (whether or not anything was removed)
	 */
	private boolean RemoveInvalidElement(Element root, Map<String, String> countMap, Map<String, String> resultMap,int thresholdNormalTextRatio)
	{
		try
		{
			if(root==null)
			{
				return false;
			}

			// Measured before any descendant is pruned: the "old*" values describe the
			// untouched subtree.
			String oldText = root.text();
			int oldTextLen = Utility.TransferHTML2Text(root.html()).length();
			int oldTotalLen = root.outerHtml().length();

			// Depth-first: clean the children before judging this element.
			boolean bHaveChild = root.children().size() > 0;
			for(Element childElem : root.children())
			{
				RemoveInvalidElement(childElem, countMap, resultMap,thresholdNormalTextRatio);
			}

			String tagName = root.tagName();
			boolean isContainer = tagName.equalsIgnoreCase("table") || tagName.equalsIgnoreCase("div");
			if(!((bHaveChild && !tagName.equalsIgnoreCase("a")) || isContainer))
			{
				return true;
			}

			// Measured after the children were pruned.
			int textLen = Utility.TransferHTML2Text(root.html()).length();
			int totalLen = root.outerHtml().length();
			String contentWithTag = Utility.aRemoveP.matcher(Utility.RemoveUselessLink(root.outerHtml())).replaceAll("").toLowerCase();
			String contentNoTag = Utility.TransferHTML2Text(contentWithTag).trim().toLowerCase();
			int normalTextLen = contentNoTag.length();
			int normalTotalLen = Utility.RemoveUselessHTMLTag(contentWithTag).length();

			// No text at all, or mostly link text / markup. The zero checks also guard
			// every division by normalTextLen below.
			if((textLen == 0) || (normalTextLen == 0) || (textLen * 1.0 / normalTextLen >= 5)
					|| (normalTotalLen * 1.0 / normalTextLen >= 10))
			{
				root.remove();
				return true;
			}

			// Short block that lost much of its text once links were stripped. Floating-point
			// division is required here: integer division would turn the 1.8 threshold into 2.
			if((normalTextLen < 120) && (oldTextLen * 1.0 / normalTextLen >= 1.8))
			{
				if((Utility.ContainDateInfo(contentNoTag)== false) &&!contentWithTag.contains("title")&&!contentWithTag.contains("<h1")&&!contentWithTag.contains("<h2"))
				{
					root.remove();
					return true;
				}
			}

			// Markup-heavy block; the allowed ratio depends on how much text remains.
			if(((normalTextLen <= 20) && (oldTotalLen * 1.0 / normalTextLen >= 30))
					|| ((normalTextLen > 20) && (normalTextLen <= 120) && (oldTotalLen * 1.0 / normalTextLen >= 26))
					|| ((normalTextLen > 120) &&(totalLen * 1.0 / normalTextLen >= thresholdNormalTextRatio))
					)
			{
				if((normalTextLen > 120) || (Utility.ContainDateInfo(contentNoTag)== false))
				{
					// Keep anything that mentions "title" in its markup.
					if(contentWithTag.contains("title")) {
						return true;
					}
					root.remove();
					return true;
				}
			}

			// Short navigation / copyright / recommendation blocks.
			if((normalTextLen <= 50) && looksLikeBoilerplate(oldText))
			{
				root.remove();
				return true;
			}

			if((normalTextLen <= 150) && (oldText.indexOf("版权所有")>=0))
			{
				root.remove();
				return true;
			}

			// Repeated-structure bookkeeping: same class, same child counts, same sibling count.
			int childRenSize = root.children().size();
			int c_childrenSize = childRenSize > 0 ? root.children().get(0).children().size() : 0;
			if(root.parent()==null) {
				return true;
			}
			String className = root.attr("class");
			String classValue = root.childNodeSize() + "~" +
					c_childrenSize + "~" + root.parent().children().size();
			String classKey = className + "~"  + childRenSize;
			String seenValue = countMap.get(classKey);
			if (seenValue != null && !className.isEmpty() && classValue.equals(seenValue)) {
				resultMap.put(classKey, classValue);
			} else {
				countMap.put(classKey, classValue);
			}

			// Very short leftovers that contain no recognisable word.
			if((normalTextLen <= 15) && (Utility.isContainedWord(contentNoTag)==false))
			{
				root.remove();
				return true;
			}
			return true;
		}
		catch(Exception e)
		{
			Log.error("错误", e);
			return false;
		}
	}

	/**
	 * Tells whether {@code text} starts with or contains one of the navigation,
	 * copyright or recommendation markers used to spot boilerplate blocks.
	 * <p>
	 * NOTE(review): {@code text} is not lower-cased by the caller, so the English markers
	 * only match lower-case input, and "&amp;copy;" can only match if the entity survives
	 * as literal text; confirm whether that is intended.
	 */
	private static boolean looksLikeBoilerplate(String text)
	{
		return text.startsWith("上一") || text.startsWith("下一")
				|| text.contains(">")
				|| text.startsWith("相关") || text.startsWith("related")
				|| text.startsWith("[") || text.startsWith("【")
				|| text.contains("导航") || text.contains("版权所有")
				|| text.contains("|") || text.startsWith("首页")
				|| text.startsWith("字体") || text.startsWith("font")
				|| text.contains("about us") || text.contains("homepage")
				|| text.contains("链接") || text.contains("links")
				|| text.contains("??")
				|| text.contains("版权")
				|| text.contains("热门跟贴") || text.contains("最新跟贴")
				|| text.contains("频道推荐") || text.contains("智能推荐")
				|| text.contains("copyright") || text.contains("&copy;")
				|| text.contains("(c)") || text.contains("all rights reserved");
	}

	
	/**
	 * Refines a {@code DocInfo} whose plain-text content has already been extracted.
	 * <p>
	 * The plain text is split on line breaks and each non-blank line that matches
	 * {@code Utility.patWordAndNum} is examined in order:
	 * <ol>
	 *   <li>origin, taken from text after "来源:" / "来源：" if not already set;</li>
	 *   <li>author, taken from text after "作者", "责任编辑", "责编" or "记者 "
	 *       (a later line overwrites an earlier one);</li>
	 *   <li>publish date, from the first line containing date information if not already set;</li>
	 *   <li>title, from the first remaining line if not already set and shorter than 50 characters;</li>
	 *   <li>the next line shorter than 50 characters and the next line containing a
	 *       date are skipped, presumably as the title and date lines of the body — TODO confirm;</li>
	 *   <li>every other line is appended to the rebuilt content, and the first one
	 *       longer than 50 characters becomes the summary if none is set.</li>
	 * </ol>
	 * Finally the tagged and plain content on {@code docInfo} are replaced with the
	 * rebuilt versions. Removing title/date sentences beyond the above is not handled.
	 *
	 * @param docInfo document record to update in place
	 * @return {@code docInfo}, or {@code null} when the language type is reported as
	 *         {@code "error"}; on an exception the record is returned with whatever was
	 *         set before the failure
	 */
	private DocInfo buildProcessItem(DocInfo docInfo) {
		try {
			String langType = Utility.getLanguageType(docInfo.getContentNoTag());
			if ("error".equals(langType)) {
				return null;
			}

			boolean flg_title = false;
			boolean flg_date = false;
			boolean bFindTitle = hasText(docInfo.getTitle());
			boolean bFindDate = hasText(docInfo.getPublishDate());
			boolean bFindSourcesite = hasText(docInfo.getOrigin());
			// A summary consisting only of non-breaking spaces counts as missing.
			String existingSummary = docInfo.getSummary();
			boolean bFindSummary = hasText(existingSummary)
					&& !existingSummary.trim().replaceAll("[\\u00A0]", "").isEmpty();

			StringBuilder content = new StringBuilder();
			StringBuilder contentWithTag = new StringBuilder();
			for (String sentence : docInfo.getContentNoTag().split("\n")) {
				if (sentence.trim().isEmpty()) {
					continue;
				}
				if (!Utility.patWordAndNum.matcher(sentence.trim()).find()) {
					continue;
				}

				// Origin
				if (!bFindSourcesite) {
					String origin = null;
					if (sentence.contains("来源:")) {
						origin = extractOrigin(sentence, "来源:", ":");
					} else if (sentence.contains("来源：")) {
						origin = extractOrigin(sentence, "来源：", "：");
					}
					if (origin != null) {
						docInfo.setOrigin(origin);
						bFindSourcesite = true;
					}
				}

				// Author
				String foundAuthor = extractAuthor(sentence);
				if (foundAuthor != null) {
					docInfo.setAuthor(foundAuthor);
				}

				// Date takes priority: a line holding the date cannot be the title.
				if (!bFindDate) {
					Matcher dateMatcher = Utility.ContainedDateInfo(sentence);
					if (dateMatcher != null) {
						String dat = DateUtil.getPublishDate(sentence);
						if (dat == null || dat.length() == 0) {
							docInfo.setPublishDate(dateMatcher.group()
									.replaceAll("年", "-").replaceAll("月", "-")
									.replaceAll("日", ""));
						} else {
							docInfo.setPublishDate(dat);
						}
						bFindDate = true;
						continue;
					}
				}

				// NOTE(review): the flag is set even when the line is too long to be used
				// as a title, so only this one line is ever considered.
				if (!bFindTitle) {
					if (sentence.length() < 50) {
						docInfo.setTitle(sentence.trim());
					}
					bFindTitle = true;
					continue;
				}

				// Skip one short line and one date line so they do not appear in the body.
				if (!flg_title && sentence.length() < 50) {
					flg_title = true;
					continue;
				}
				if (!flg_date && Utility.ContainedDateInfo(sentence) != null) {
					flg_date = true;
					continue;
				}

				contentWithTag.append("<p>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;").append(sentence).append("</p>\n");
				content.append(sentence).append("\n");

				if (!bFindSummary && sentence.length() > 50) {
					docInfo.setSummary(sentence);
					bFindSummary = true;
				}
			}

			// Cut the author at a closing parenthesis (ASCII or full-width).
			String author = docInfo.getAuthor();
			if (author != null) {
				String[] parts = null;
				if (author.contains(")")) {
					parts = author.split("\\)");
				} else if (author.contains("）")) {
					parts = author.split("）");
				}
				if (parts != null) {
					for (String part : parts) {
						if (part.length() != 0 && part.length() < 8) {
							docInfo.setAuthor(part);
							break;
						}
					}
				}
			}

			StringBuilder sb = new StringBuilder();
			sb.append("<html><head>");
			sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
			sb.append("<title></title></head><body><div>\n");
			sb.append(contentWithTag);
			sb.append("</div></body></html>");
			docInfo.setContentWithTag(sb.toString());
			docInfo.setContentNoTag(content.toString());
		} catch (Exception ex) {
			Log.error("错误", ex);
		}
		return docInfo;
	}

	/** Returns {@code true} when {@code value} is non-null and not empty after {@code trim()}. */
	private static boolean hasText(String value) {
		return value != null && !value.trim().isEmpty();
	}

	/**
	 * Extracts the origin that follows {@code marker} in {@code sentence}.
	 * <p>
	 * The first space-separated token shorter than 10 characters is used, with
	 * {@code colon} stripped from it; if there is no such token the whole remainder is
	 * used. "举报" is removed from the result.
	 *
	 * @return the origin text, or {@code null} when nothing follows the marker
	 */
	private static String extractOrigin(String sentence, String marker, String colon) {
		int start = sentence.indexOf(marker) + marker.length();
		if (sentence.length() <= start) {
			return null;
		}
		String origin = sentence.substring(start);
		for (String token : origin.split(" ")) {
			if (token.length() != 0 && token.length() < 10) {
				origin = token.replace(colon, "");
				break;
			}
		}
		return origin.replace("举报", "");
	}

	/**
	 * Extracts an author name from the first marker found in {@code sentence}; only the
	 * first marker present is tried.
	 * <p>
	 * NOTE(review): the skip widths differ per marker ("责任编辑" skips no separator,
	 * "记者 " skips one extra character); they are kept as they were — confirm the
	 * expected input format before changing them.
	 *
	 * @return the author token, or {@code null} when none is found
	 */
	private static String extractAuthor(String sentence) {
		if (sentence.contains("作者")) {
			return firstShortToken(sentence, "作者", 3, 8);
		}
		if (sentence.contains("责任编辑")) {
			return firstShortToken(sentence, "责任编辑", 4, 8);
		}
		if (sentence.contains("责编")) {
			return firstShortToken(sentence, "责编", 3, 8);
		}
		if (sentence.contains("记者 ")) {
			return firstShortToken(sentence, "记者 ", 4, 8);
		}
		return null;
	}

	/**
	 * Returns the first non-empty space-separated token shorter than {@code maxLen}
	 * found {@code skip} characters after the start of {@code marker}, or {@code null}
	 * when the sentence ends before that point or no token qualifies.
	 */
	private static String firstShortToken(String sentence, String marker, int skip, int maxLen) {
		int start = sentence.indexOf(marker) + skip;
		if (sentence.length() <= start) {
			return null;
		}
		for (String token : sentence.substring(start).split(" ")) {
			if (token.length() != 0 && token.length() < maxLen) {
				return token;
			}
		}
		return null;
	}

}
