package com.zzsn.util;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.mozilla.universalchardet.UniversalDetector;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;


/*
 * 
 * Utility: a collection of general-purpose helper functions.
 * ver:2014.04.03
 * ver:2014.03.26
 * ver: 2014.03.14
 *  ver: 2014.3.11
 * ver:2014.03.09
 * ver:2014.03.05
 * ver: 2014.03.04
 * ver: 2014.02.19
 *  ver: 2013.11.19
 * ver: 2013.10.19
 * ver: 2013.09.24
 * ver: 2013.09.20
 *
 */
@SuppressWarnings("deprecation")
public class Utility {
	// Task execution status flag.
	public static int status_flg = 0;
	// One character in the range U+4E00..U+9FA5; isContainedChiWord() uses it to detect Chinese text.
	static String regEx = "[\\u4e00-\\u9fa5]";   
    static Pattern patChi = Pattern.compile(regEx);
	// One ASCII letter or one character in U+4E00..U+9FA5 (a "word" character for isContainedWord()).
    static String regExAll = "[a-zA-Z\\u4e00-\\u9fa5]";   
    static Pattern patWord= Pattern.compile(regExAll);
	// Complement of regExAll: any character that is NOT an ASCII letter or in U+4E00..U+9FA5.
    static String regUnExAll = "[^a-zA-Z\\u4e00-\\u9fa5]";
    static Pattern patUnWord= Pattern.compile(regUnExAll);
	// Like patWord, but ASCII digits are accepted as well.
    public static Pattern patWordAndNum= Pattern.compile("[0-9a-zA-Z\\u4e00-\\u9fa5]");  
	// Word -> stem lookup table; stays null until stemming() loads it from disk.
	static HashMap<String,String> stemMap = null; 
	
	// Decimal HTML character reference with 4 or 5 digits (e.g. "&#8226;"); group 1 is the number.
	static String regHTMLNumcode = "&#(\\d{4,5});";   
    static Pattern patHTMLNumCode = Pattern.compile(regHTMLNumcode);
	
	// A <div> whose opening tag contains "display:none", up to the first following </div>, e.g.
	//<div id="ctl00_PlaceHolderMain_ctl01_ctl05_label" style="display:none">Page Content</div>
	static Pattern divNoneP = Pattern.compile("(?s)<div[^>]*display:none[^>]*>.*?</div>",Pattern.CASE_INSENSITIVE);
	// Literal, attribute-free tags and the &nbsp; entity (all case-insensitive).
	static Pattern divP = Pattern.compile("<div>",Pattern.CASE_INSENSITIVE);
	static Pattern divRP = Pattern.compile("</div>",Pattern.CASE_INSENSITIVE);
	static Pattern brP = Pattern.compile("<br />",Pattern.CASE_INSENSITIVE);
	static Pattern br2P = Pattern.compile("<br>",Pattern.CASE_INSENSITIVE);
	static Pattern spaceP = Pattern.compile("&nbsp;",Pattern.CASE_INSENSITIVE);
	static Pattern strongP = Pattern.compile("<strong>",Pattern.CASE_INSENSITIVE);
	static Pattern strongRP = Pattern.compile("</strong>",Pattern.CASE_INSENSITIVE);
	static Pattern pP = Pattern.compile("<p>",Pattern.CASE_INSENSITIVE);
	static Pattern pRP = Pattern.compile("</p>",Pattern.CASE_INSENSITIVE);

	// <center ...> opening tag (attributes allowed) and its closing tag.
	static Pattern centerP = Pattern.compile("<center[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern centerRP = Pattern.compile("</center>",Pattern.CASE_INSENSITIVE);
	
	// Any opening tag with an alphanumeric name; group 1 is the tag name, so replacing with "<$1>" drops the attributes.
	static Pattern removeAttrP = Pattern.compile("<([a-zA-Z0-9]+)[^>]*>",Pattern.CASE_INSENSITIVE);
	// Text from "<!--...>" up to the next "<!...-->"; this has the shape of conditional comments such as <!--[if IE]> ... <![endif]-->.
	static Pattern commentP = Pattern.compile("(?s)<!--[^>]*>.*?<![^>]*-->",Pattern.CASE_INSENSITIVE);
	// Form-related tags; buttonP/iframeP/noscriptP/objectP match the whole element including its content.
	static Pattern inputP = Pattern.compile("<input[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern formP = Pattern.compile("<form[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern formRP = Pattern.compile("</form>",Pattern.CASE_INSENSITIVE);
	static Pattern buttonP = Pattern.compile("(?s)<button[^>]*>.*?</button>",Pattern.CASE_INSENSITIVE);
	static Pattern iframeP = Pattern.compile("(?s)<iframe[^>]*>.*?</iframe>",Pattern.CASE_INSENSITIVE);
	static Pattern noscriptP = Pattern.compile("(?s)<noscript>.*?</noscript>",Pattern.CASE_INSENSITIVE);
	static Pattern objectP = Pattern.compile("(?s)<object[^>]*>.*?</object>",Pattern.CASE_INSENSITIVE);
	static Pattern linkP = Pattern.compile("(?s)<link[^>]*>",Pattern.CASE_INSENSITIVE);

	// imgReplaceP / imgRevReplaceP temporarily rename <img ...> to <_img ...> and back,
	// so that removeAttrP (which does not match a name starting with '_') leaves image attributes intact.
	static Pattern imgReplaceP = Pattern.compile("<img([^>]*)>",Pattern.CASE_INSENSITIVE);
	static Pattern imgRevReplaceP = Pattern.compile("<_img([^>]*)>",Pattern.CASE_INSENSITIVE);
	static Pattern imgP = Pattern.compile("<img[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern imgRP = Pattern.compile("</img>",Pattern.CASE_INSENSITIVE);
	// Whole <a>...</a> and <legend>...</legend> elements, content included.
	public static Pattern aRemoveP = Pattern.compile("(?s)<a[^>]*>.*?</a>",Pattern.CASE_INSENSITIVE);
	static Pattern legendRemoveP = Pattern.compile("(?s)<legend[^>]*>.*?</legend>",Pattern.CASE_INSENSITIVE);
	
	// Individual opening/closing tags (content is not part of the match).
	static Pattern aP = Pattern.compile("<a[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern aRP = Pattern.compile("</a>",Pattern.CASE_INSENSITIVE);
	static Pattern fontP = Pattern.compile("<font[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern fontRP = Pattern.compile("</font>",Pattern.CASE_INSENSITIVE);
	static Pattern hP = Pattern.compile("<h\\d[^>]*>",Pattern.CASE_INSENSITIVE);
	static Pattern hRP = Pattern.compile("</h\\d>",Pattern.CASE_INSENSITIVE);
	static Pattern ulRP = Pattern.compile("</ul>",Pattern.CASE_INSENSITIVE);
	static Pattern liRP = Pattern.compile("</li>",Pattern.CASE_INSENSITIVE);
	static Pattern trRP = Pattern.compile("</tr>",Pattern.CASE_INSENSITIVE);
	static Pattern tdRP = Pattern.compile("</td>",Pattern.CASE_INSENSITIVE);
	
	// Whole form-control elements, content included.
	static Pattern textareaRemoveP = Pattern.compile("(?s)<textarea[^>]*>.*?</textarea>",Pattern.CASE_INSENSITIVE);
	static Pattern selectRemoveP = Pattern.compile("(?s)<select[^>]*>.*?</select>",Pattern.CASE_INSENSITIVE);
	static Pattern optionRemoveP = Pattern.compile("(?s)<option[^>]*>.*?</option>",Pattern.CASE_INSENSITIVE);
	static Pattern labelRemoveP = Pattern.compile("(?s)<label[^>]*>.*?</label>",Pattern.CASE_INSENSITIVE);
	
	
	// Date-shaped text patterns. They are not referenced in the visible part of this file;
	// presumably date-parsing code elsewhere uses them (TODO confirm).
	// patDate0: digits-digits-digits, e.g. 2014-03-05.
	private static Pattern patDate0 = Pattern.compile("\\d+-\\d{1,2}-\\d+");
	// patDate1: like patDate0, but the first separator may also be whitespace, '/', or one of 年 月 日.
	private static Pattern patDate1 = Pattern.compile("\\d+[-\\s/年月日]\\d{1,2}-\\d+", Pattern.CASE_INSENSITIVE);
	// patDate2: "<digits> <Capitalized word> <digits>", e.g. 5 March 2014.
	private static Pattern patDate2 = Pattern.compile("\\d+\\s+[A-Z][a-z]+\\s+\\d+");
	// patDate3: "<Capitalized word, dots allowed> <1-2 digits>, <digits>", e.g. Mar. 5, 2014.
	private static Pattern patDate3 = Pattern.compile("[A-Z][a-z\\.]+\\s+\\d{1,2},\\s+\\d+");
	// patDate4: digits年digits月digits日.
	private static Pattern patDate4 = Pattern.compile("\\d+年\\d+月\\d+日");
	// patDate5: digits/digits/digits.
	private static Pattern patDate5 = Pattern.compile("\\d+/\\d{1,2}/\\d+");
	// patDate6: digits.digits.digits.
	private static Pattern patDate6 = Pattern.compile("\\d+\\.\\d+\\.\\d+");
	
	// Date formatters; the numeric suffixes appear to pair with the patDateN patterns above (TODO confirm).
	// NOTE(review): SimpleDateFormat is documented as not thread-safe, and these instances are shared statics.
	private static SimpleDateFormat formatter0 = new SimpleDateFormat("yyyy-MM-dd");
	private static SimpleDateFormat formatter0_1 = new SimpleDateFormat("yy-MM-dd");
	private static SimpleDateFormat formatter2 = new SimpleDateFormat("dd MMM yyyy",Locale.ENGLISH);
	private static SimpleDateFormat formatter3_1 = new SimpleDateFormat("MMM dd, yyyy",Locale.ENGLISH);
	private static SimpleDateFormat formatter3_2 = new SimpleDateFormat("MMM. dd, yyyy",Locale.ENGLISH);
	private static SimpleDateFormat formatter4 = new SimpleDateFormat("yyyy年MM月dd");
	private static SimpleDateFormat formatter5_1 = new SimpleDateFormat("yyyy/MM/dd");
	private static SimpleDateFormat formatter5_2 = new SimpleDateFormat("dd/MM/yyyy");
	private static SimpleDateFormat formatter5_4 = new SimpleDateFormat("yy/MM/dd");
	private static SimpleDateFormat formatter5_3 = new SimpleDateFormat("dd/MM/yy");
	private static SimpleDateFormat formatter6 = new SimpleDateFormat("yyyy.MM.dd");
	
	
	// Initialised to null here and not assigned in the visible part of this file.
	private static Date thresholdDate  = null;
	
	/**
	 * Determines the character encoding declared by an HTML page held in memory.
	 * <p>
	 * Scans the {@code <meta ...>} tags in document order and returns the value of
	 * the first {@code charset=} declaration found. The attribute name is matched
	 * case-insensitively, whitespace around {@code =} is tolerated, and the value
	 * may be unquoted, double-quoted or single-quoted.
	 *
	 * @param content HTML text; may be {@code null}
	 * @return the declared charset name exactly as written in the page, or
	 *         {@code "GB2312"} when no (or only an empty) declaration is found
	 */
	public static String getWebEncodingByStr(String content) {
		final String defaultEncoding = "GB2312";
		if (content == null) {
			return defaultEncoding;
		}
		Pattern metaTag = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);
		// Group 1 is the charset value; an optional opening quote is skipped and the
		// value ends at whitespace, a quote, ';', '|' or '>'.
		Pattern charsetAttr = Pattern.compile("charset\\s*=\\s*[\"']?([^\\s\"';|>]*)",
				Pattern.CASE_INSENSITIVE);

		Matcher meta = metaTag.matcher(content);
		while (meta.find()) {
			Matcher charset = charsetAttr.matcher(meta.group());
			if (charset.find()) {
				String encoding = charset.group(1);
				// An empty declaration falls back to the default.
				return encoding.length() == 0 ? defaultEncoding : encoding;
			}
		}
		return defaultEncoding;
	}
	
	/**
	 * Determines the character encoding declared inside an HTML file.
	 * <p>
	 * The file is read line by line with the platform default charset (enough to
	 * find the ASCII {@code <meta ... charset=...>} declaration) and the first
	 * {@code charset=} value found in a {@code <meta>} tag is returned. The
	 * attribute name is matched case-insensitively, whitespace around {@code =}
	 * is tolerated, and the value may be unquoted or quoted with either quote
	 * character.
	 *
	 * @param fileName path of the file to inspect
	 * @return the declared charset name as written in the file; {@code "GB2312"}
	 *         when a declaration is present but empty; {@code "UTF-8"} when no
	 *         declaration exists or the file cannot be read
	 */
	public static String getWebEncoding(String fileName) {
		String content;
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
			StringBuilder sb = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line).append("\r\n");
			}
			content = sb.toString();
		} catch (IOException e) {
			e.printStackTrace();
			return "UTF-8";
		} finally {
			// Always release the file handle, also when reading failed half-way.
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// nothing useful can be done if close fails
				}
			}
		}

		Pattern metaTag = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);
		// Group 1 is the charset value; an optional opening quote is skipped and the
		// value ends at whitespace, a quote, ';', '|' or '>'.
		Pattern charsetAttr = Pattern.compile("charset\\s*=\\s*[\"']?([^\\s\"';|>]*)",
				Pattern.CASE_INSENSITIVE);
		Matcher meta = metaTag.matcher(content);
		while (meta.find()) {
			Matcher charset = charsetAttr.matcher(meta.group());
			if (charset.find()) {
				String encoding = charset.group(1);
				return encoding.length() == 0 ? "GB2312" : encoding;
			}
		}
		return "UTF-8"; //encoding = "GB2312";
	}
	/**
	 * Detects the language of a piece of text via {@code LangTypeDetector.DetectLang}.
	 *
	 * @param content text to classify
	 * @return the detector's result, or {@code "error"} when the detector yields
	 *         {@code null} or an empty string
	 */
	public static String getLanguageType(String content) {
		final String detected = LangTypeDetector.DetectLang(content);
		final boolean nothingDetected = (detected == null) || "".equals(detected);
		return nothingDetected ? "error" : detected;
	}

	/**
	 * Guesses the character encoding of a file by feeding its bytes to a
	 * {@code UniversalDetector}.
	 *
	 * @param fileName path of the file to inspect; may be {@code null}
	 * @return the detected charset name; {@code "GB2312"} when the detector cannot
	 *         decide; {@code null} when {@code fileName} is {@code null}, the file
	 *         cannot be opened, or detection fails unexpectedly
	 */
	public static String detectCharSet(String fileName) {
		if (fileName == null) {
			return null;
		}
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(fileName);
			UniversalDetector detector = new UniversalDetector(null);
			byte[] buf = new byte[4096];
			try {
				int nread;
				while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
					detector.handleData(buf, 0, nread);
				}
			} catch (IOException ignored) {
				// Best effort: a read error ends the input early and detection
				// continues with the bytes fed so far.
			}
			detector.dataEnd();

			String encoding = detector.getDetectedCharset();
			detector.reset();
			return (encoding != null) ? encoding : "GB2312";
		} catch (Exception e) {
			// Covers a missing/unreadable file as well as detector failures.
			return null;
		} finally {
			if (fis != null) {
				try {
					fis.close();
				} catch (IOException ignored) {
					// nothing useful can be done if close fails
				}
			}
		}
	}
	/*
	 * Determines the encoding of a file.
	 * Delegates to getWebEncoding(fileName) and returns its result unchanged.
	 */
	public static String getFileEncoding(String fileName) {
		return getWebEncoding(fileName);
	}

	/**
	 * Legacy HTML-to-text conversion (see {@code RemoveHTMLCode} for the
	 * pattern-based variant).
	 * <p>
	 * Applies a fixed list of regex replacements in order: block-level tags and
	 * line breaks become newlines, inline formatting tags are dropped,
	 * {@code &nbsp;} becomes a space, {@code &#8226;} becomes the bullet
	 * character U+2022, and finally every remaining tag is removed. The result is
	 * trimmed.
	 *
	 * @param src HTML fragment; must not be {@code null}
	 * @return the text with markup removed
	 */
	public static String RemoveHTMLCode_old(String src) {
		// {regex, replacement} pairs, applied top to bottom.
		final String[][] rules = {
			{"<DIV>", "\n\n"},
			{"</DIV>", "\n\n"},
			{"<div>", ""},
			{"</div>", "\n\n"},
			{"<BR>", "\n\n"},
			{"<br>", "\n\n"},
			{"<br />", "\n\n"},
			{"<BR />", "\n\n"},
			{"&nbsp;", " "},
			{"<DIV>", ""},
			{"<div>", ""},
			// &#8226; is the HTML reference for BULLET (U+2022); the previous
			// replacement literal had been corrupted to "??".
			{"&#8226;", "\u2022"},
			{"<STRONG>", ""},
			{"</STRONG>", ""},
			{"<strong>", ""},
			{"</strong>", ""},
			{"</p>", "\n\n"},
			{"</P>", "\n\n"},
			{"<P>", "\n\n"},
			{"<p>", "\n\n"},
			{"<a[^>]*>", ""},
			{"<img[^>]*>", ""},
			{"</a>", ""},
			{"<font[^>]*>", ""},
			{"</font>", ""},
			{"<FONT[^>]*>", ""},
			{"</FONT>", ""},
			{"</h\\d>", "\n\n"},
			{"</H\\d>", "\n\n"},
			{"</ul>", "\n\n"},
			{"</UL>", "\n\n"},
			{"</li>", "\n\n"},
			{"</LI>", "\n\n"},
			{"</tr>", "\n"},
			{"</TR>", "\n"},
			// Anything still looking like a tag is dropped.
			{"<[^>]*>", ""},
		};
		for (String[] rule : rules) {
			src = src.replaceAll(rule[0], rule[1]);
		}
		return src.trim();
	}

	
	/**
	 * Converts an HTML fragment to plain text using the precompiled,
	 * case-insensitive tag patterns of this class.
	 * <p>
	 * Whitespace between two adjacent tags is removed first; block-level tags and
	 * line breaks then become blank lines, inline formatting/anchor/image/font
	 * tags are dropped, {@code &nbsp;} becomes a space, {@code &#8226;} becomes
	 * the bullet character U+2022, and every remaining tag is stripped. The
	 * result is trimmed.
	 *
	 * @param src HTML fragment; must not be {@code null}
	 * @return the text with markup removed
	 */
	public static String RemoveHTMLCode(String src) {
		// Collapse whitespace that sits between two consecutive tags.
		src = src.replaceAll("(<[^>]*>)\\s*(<[^>]*>)", "$1$2");
		src = divP.matcher(src).replaceAll("\n\n");
		src = divRP.matcher(src).replaceAll("\n\n");
		src = brP.matcher(src).replaceAll("\n\n");
		src = br2P.matcher(src).replaceAll("\n\n");
		src = spaceP.matcher(src).replaceAll(" ");
		// &#8226; is the HTML reference for BULLET (U+2022); the previous
		// replacement literal had been corrupted to "??".
		src = src.replaceAll("&#8226;", "\u2022");
		src = strongP.matcher(src).replaceAll("");
		src = strongRP.matcher(src).replaceAll("");
		src = pP.matcher(src).replaceAll("\n\n");
		src = pRP.matcher(src).replaceAll("\n\n");
		src = aP.matcher(src).replaceAll("");
		src = aRP.matcher(src).replaceAll("");
		src = imgP.matcher(src).replaceAll("");
		src = fontP.matcher(src).replaceAll("");
		src = fontRP.matcher(src).replaceAll("");
		src = hRP.matcher(src).replaceAll("\n\n");
		src = ulRP.matcher(src).replaceAll("\n\n");
		src = liRP.matcher(src).replaceAll("\n\n");
		src = trRP.matcher(src).replaceAll("\n\n");
		src = tdRP.matcher(src).replaceAll("\n\n");

		// Anything still looking like a tag is dropped.
		src = src.replaceAll("<[^>]*>", "");

		return src.trim();
	}

	/**
	 * Decodes a fixed set of named HTML character references, removes
	 * {@code <?...>} processing instructions, and decodes 4- or 5-digit decimal
	 * numeric references ({@code &#NNNN;}).
	 * <p>
	 * NOTE(review): {@code &amp;} is decoded before some of the other names and
	 * before numeric references, so double-escaped input such as
	 * {@code &amp;lt;} ends up fully decoded to {@code <}. That order is kept
	 * as-is because callers may rely on it for double-escaped pages.
	 *
	 * @param str text containing HTML character references; must not be {@code null}
	 * @return the decoded text
	 */
	public static String HTMLDecode(String str) {
		// Named references.
		str = str.replaceAll("&quot;", "\"");
		str = str.replaceAll("&nbsp;", " ");
		str = str.replaceAll("&middot;", "·");
		str = str.replaceAll("&amp;", "&");
		str = str.replaceAll("&ldquo;", "“");
		str = str.replaceAll("&rdquo;", "”");
		str = str.replaceAll("&gt;", ">");
		str = str.replaceAll("&lt;", "<");
		// The three replacement literals below had been corrupted to "??";
		// they are restored as Unicode escapes so the source encoding cannot
		// damage them again.
		str = str.replaceAll("&raquo;", "\u00BB");   // right-pointing double angle quotation mark
		str = str.replaceAll("&times;", "×");
		str = str.replaceAll("&ccedil;", "\u00E7");  // c with cedilla
		str = str.replaceAll("&atilde;", "\u00E3");  // a with tilde
		str = str.replaceAll("&ecirc;", "ê");

		// Remove <? ... > processing instructions.
		str = str.replaceAll("<\\?[^>]*>", "");

		// Numeric references, decoded in one linear pass. Character.toChars
		// keeps code points above U+FFFF intact, which a (char) cast would
		// silently truncate.
		Matcher matcher = patHTMLNumCode.matcher(str);
		StringBuffer decoded = new StringBuffer(str.length());
		while (matcher.find()) {
			int codePoint = Integer.parseInt(matcher.group(1));
			String replacement = new String(Character.toChars(codePoint));
			matcher.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
		}
		matcher.appendTail(decoded);
		return decoded.toString();
	}
	
	/**
	 * Removes form controls from an HTML fragment: whole textarea, select,
	 * option, label and button elements (content included) and the bare input
	 * and form tags.
	 *
	 * @param htmlText HTML fragment; must not be {@code null}
	 * @return the fragment without form controls
	 */
	public static String RemoveHTMLControl(String htmlText)
	{
		// Applied strictly in this order.
		final Pattern[] controlPatterns = {
			textareaRemoveP,
			selectRemoveP,
			optionRemoveP,
			labelRemoveP,
			inputP,
			formP,
			buttonP,
			formRP,
		};
		String stripped = htmlText;
		for (Pattern control : controlPatterns) {
			stripped = control.matcher(stripped).replaceAll("");
		}
		return stripped;
	}

	/**
	 * Strips non-content markup from HTML: script and style elements (paired and
	 * self-closing forms), noscript and object elements, link tags, and HTML
	 * comments.
	 * <p>
	 * Any exception raised while stripping is printed and the text as it stood
	 * after the last successful step is returned.
	 *
	 * @param content HTML text
	 * @return the text with the listed constructs removed
	 */
	public static String RemoveStyleCode(String content) {
		try {
			// Removal steps, applied in order.
			final Pattern[] removals = {
				Pattern.compile("(?s)<script\\s*.*?>(.*?)</script>", Pattern.CASE_INSENSITIVE),
				Pattern.compile("(?s)<style\\s*.*?>(.*?)</style>", Pattern.CASE_INSENSITIVE),
				Pattern.compile("(?s)<script\\s*.*?/>", Pattern.CASE_INSENSITIVE),
				Pattern.compile("(?s)<style\\s*.*?/>", Pattern.CASE_INSENSITIVE),
				noscriptP,
				objectP,
				linkP,
				// HTML comments
				Pattern.compile("(?s)<!--.*?-->"),
			};
			for (Pattern removal : removals) {
				content = removal.matcher(content).replaceAll("");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return content;
	}

	/**
	 * Deletes every carriage return and line feed from the text.
	 *
	 * @param src input text; must not be {@code null}
	 * @return {@code src} without any CR or LF characters
	 */
	public static String RemoveReturnCode(String src) {
		return src.replaceAll("\r", "").replaceAll("\n", "");
	}
	/**
	 * Deletes every carriage return and line feed from an HTML string, joining
	 * it into a single line.
	 *
	 * @param src HTML text; must not be {@code null}
	 * @return {@code src} without any CR or LF characters
	 */
	public static String RemoveHTMLReturnCode(String src)
	{
		final String withoutCr = src.replaceAll("\r", "");
		return withoutCr.replaceAll("\n", "");
	}

	/**
	 * Inserts CRLF line breaks into HTML so that it can be processed line by
	 * line: after every {@code </div>}, {@code <div} (directly after the tag
	 * name), {@code </p>}, {@code <p>}, {@code <br>} and {@code </li>}.
	 * Matching is case-insensitive and matched tags are rewritten in lower case.
	 * <p>
	 * Any exception is printed and the text as it stood after the last
	 * successful step is returned.
	 *
	 * @param content HTML text
	 * @return the HTML with the line breaks inserted
	 */
	public static String AddHTMLLine(String content) {
		// {regex, replacement} pairs, applied in order.
		final String[][] breakRules = {
			{"(?s)</div>", "</div>\r\n"},
			{"(?s)<div", "<div\r\n"},
			{"(?s)</p>", "</p>\r\n"},
			{"(?s)<p>", "<p>\r\n"},
			{"(?s)<br>", "<br>\r\n"},
			{"(?s)</li>", "</li>\r\n"},
		};
		try {
			for (String[] rule : breakRules) {
				Pattern tag = Pattern.compile(rule[0], Pattern.CASE_INSENSITIVE);
				content = tag.matcher(content).replaceAll(rule[1]);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return content;
	}

	/**
	 * Converts a byte array to its lower-case hexadecimal representation, two
	 * digits per byte.
	 *
	 * @param src bytes to convert
	 * @return the hex string, or {@code null} when {@code src} is {@code null} or empty
	 */
	public static String bytesToHexString(byte[] src) {
		if (src == null || src.length == 0) {
			return null;
		}
		final char[] digits = "0123456789abcdef".toCharArray();
		StringBuilder hex = new StringBuilder(src.length * 2);
		for (byte b : src) {
			hex.append(digits[(b >> 4) & 0x0F]);  // high nibble
			hex.append(digits[b & 0x0F]);         // low nibble
		}
		return hex.toString();
	}

	/**
	 * Classifies a file by the first 7 bytes read from its stream.
	 * <p>
	 * The bytes are rendered as upper-case hex and searched for known
	 * signatures. A read error is printed and classification proceeds with
	 * whatever is in the buffer.
	 *
	 * @param is stream positioned at the start of the file; it is not closed here
	 * @return {@code "web"}, {@code "word"} or {@code "pdf"}; {@code null} for
	 *         XML; {@code "web"} for anything unrecognised
	 */
	public static String getTypeByStream(FileInputStream is) {
		byte[] header = new byte[7];
		try {
			is.read(header, 0, header.length);
		} catch (IOException e) {
			e.printStackTrace();
		}
		final String magic = bytesToHexString(header).toUpperCase();

		// "<!DO", "<htm", "<HTM"
		if (magic.contains("3C21444F") || magic.contains("3C68746D")
				|| magic.contains("3C48544D")) {
			return "web";
		}
		// D0 CF 11 E0: reported as "word"; getFileType() refines it by extension
		if (magic.contains("D0CF11E0")) {
			return "word";
		}
		// "%PDF-1."
		if (magic.contains("255044462D312E")) {
			return "pdf";
		}
		// "PK" 03 04 14: also reported as "word"
		if (magic.contains("504B030414")) {
			return "word";
		}
		// "<?xm" (xml)
		if (magic.contains("3C3F786D")) {
			return null;
		}
		return "web";// return "unknown";
	}
	
	

	/**
	 * Determines the type of a file from its leading bytes, using the extension
	 * to tell Office formats apart.
	 * <p>
	 * {@code getTypeByStream} reports every Office-style container as
	 * {@code "word"}; for those, a {@code .xls/.xlsx} extension yields
	 * {@code "excel"} and {@code .ppt/.pptx} yields {@code "ppt"}; any other
	 * (or missing) extension stays {@code "word"}.
	 *
	 * @param file path of the file to inspect
	 * @return {@code "web"}, {@code "word"}, {@code "excel"}, {@code "ppt"} or
	 *         {@code "pdf"}; {@code "unknown"} when the file cannot be read or
	 *         {@code getTypeByStream} reports no type (it returns {@code null}
	 *         for XML)
	 */
	public static String getFileType(String file) {
		FileInputStream is = null;
		try {
			is = new FileInputStream(file);
			String type = getTypeByStream(is);
			if (type == null) {
				// getTypeByStream returns null for XML; report it like any other
				// type this method cannot name instead of failing with an NPE.
				return "unknown";
			}
			if (type.equals("word")) {
				// May be doc, excel or ppt: decide by extension.
				int dot = file.lastIndexOf('.');
				String ext = (dot >= 0) ? file.substring(dot).toLowerCase() : "";
				if (ext.equals(".xls") || ext.equals(".xlsx")) {
					return "excel";
				}
				if (ext.equals(".ppt") || ext.equals(".pptx")) {
					return "ppt";
				}
			}
			return type;
		} catch (IOException e) {
			// Includes FileNotFoundException.
			e.printStackTrace();
		} finally {
			// Close on every path, including the early returns above.
			if (is != null) {
				try {
					is.close();
				} catch (IOException ignored) {
					// nothing useful can be done if close fails
				}
			}
		}
		return "unknown";
	}
	
	/**
	 * Returns the file extension to use for {@code file}.
	 * <p>
	 * The current implementation ignores its argument and always returns
	 * {@code ".html"}. The earlier logic, which derived the extension from the
	 * text after the last '.', is kept in the method body as commented-out code.
	 * Author: Li Dongliang
	 * Created: 2015-6-4 18:41:36
	 * @version 1.0
	 * @param file file name or path (currently unused)
	 * @return always {@code ".html"}
	 */
	public static String getFileExt(String file){
/*		int index = file.lastIndexOf(".");
		if((index >=0) && (index < file.length() - 1))
		{
			String str = file.substring(index+1);
			return "."+StringFilter(str);
		}
		else
		{
			return "";
		}*/
		return ".html";
	}

	/**
	 * Returns the last component of a path, i.e. the text after the last
	 * {@code '\'} or {@code '/'}, whichever occurs later. Paths mixing both
	 * separators (for example {@code C:\dir/file.txt}) are therefore handled.
	 *
	 * @param file path or file name; may be {@code null}
	 * @return the file name; {@code file} itself when it contains no separator;
	 *         {@code null} when {@code file} is {@code null}
	 */
	public static String getFileName(String file) {
		if (file == null) {
			return null;
		}
		// Use the right-most separator of either kind.
		int index = Math.max(file.lastIndexOf('\\'), file.lastIndexOf('/'));
		if (index < 0) {
			return file;
		}
		return file.substring(index + 1);
	}
	
	/**
	 * Returns the directory part of a path, i.e. everything up to and including
	 * the last {@code '\'} or {@code '/'}, whichever occurs later. Paths mixing
	 * both separators (for example {@code C:\dir/file.txt}) are therefore
	 * handled.
	 *
	 * @param file path or file name; may be {@code null}
	 * @return the directory part including its trailing separator; {@code ""}
	 *         when {@code file} contains no separator; {@code null} when
	 *         {@code file} is {@code null}
	 */
	public static String getFilePath(String file) {
		if (file == null) {
			return null;
		}
		// Use the right-most separator of either kind.
		int index = Math.max(file.lastIndexOf('\\'), file.lastIndexOf('/'));
		if (index < 0) {
			return "";
		}
		return file.substring(0, index + 1);
	}
	

	/**
	 * Returns the file name without its extension: the result of
	 * {@code getFileName(file)} cut at its last {@code '.'}.
	 *
	 * @param file path or file name; may be {@code null}
	 * @return the name without extension; the whole name when it has no dot;
	 *         {@code ""} when the name starts with its only dot; {@code null}
	 *         when {@code file} is {@code null}
	 */
	public static String getFileTitle(String file) {
		if (file == null) {
			return null;
		}
		final String name = getFileName(file);
		final int dot = name.lastIndexOf(".");
		if (dot == 0) {
			return "";
		}
		return (dot < 0) ? name : name.substring(0, dot);
	}

	
	/**
	 * Heuristic check for a usable English sentence: at least 10 characters
	 * long, and at most 30% of its length left over once ASCII letters and
	 * spaces are removed (and the remainder trimmed).
	 *
	 * @param sentence text to check; must not be {@code null}
	 * @return {@code true} when the sentence passes both checks
	 */
	public static boolean isGoodEngSentence(String sentence)
	{
		final int total = sentence.length();
		if (total < 10) {
			return false;
		}
		final int noise = sentence.replaceAll("[a-zA-Z ]", "").trim().length();
		return !(noise * 1.0 / total > 0.3);
	}
	
	/**
	 * Parameterised variant of the English sentence heuristic.
	 * <p>
	 * The sentence length must lie in {@code [minlen, maxlen]}; the characters
	 * left after removing ASCII letters and spaces (and trimming) must be at
	 * most 25% of the length and at most {@code maxOtherLen} in number.
	 *
	 * @param sentence    text to check; must not be {@code null}
	 * @param minlen      minimum accepted length
	 * @param maxlen      maximum accepted length
	 * @param maxOtherLen maximum number of leftover characters
	 * @return {@code true} when all checks pass
	 */
	public static boolean isGoodEngSentenceX(String sentence,int minlen,int maxlen,int maxOtherLen)
	{
		final int total = sentence.length();
		final boolean lengthOk = (total >= minlen) && (total <= maxlen);
		if (!lengthOk) {
			return false;
		}
		final int noise = sentence.replaceAll("[a-zA-Z ]", "").trim().length();
		final boolean tooNoisy = (noise * 1.0 / total > 0.25) || (noise > maxOtherLen);
		return !tooNoisy;
	}
	
	/**
	 * Heuristic check for a usable Chinese sentence: at least 5 characters
	 * long, and at most 40% of its length left over once characters in
	 * U+4E00..U+9FA5 and spaces are removed (and the remainder trimmed).
	 *
	 * @param sentence text to check; must not be {@code null}
	 * @return {@code true} when the sentence passes both checks
	 */
	public static boolean isGoodChiSentence(String sentence)
	{
		final int total = sentence.length();
		if (total < 5) {
			return false;
		}
		final int noise = sentence.replaceAll("[\\u4e00-\\u9fa5 ]","").trim().length();
		return !(noise * 1.0 / total > 0.4);
	}

	/**
	 * Parameterised variant of the Chinese sentence heuristic.
	 * <p>
	 * The sentence length must lie in {@code [minlen, maxlen]}; the characters
	 * left after removing U+4E00..U+9FA5 characters and spaces (and trimming)
	 * must be at most 30% of the length and at most {@code maxOtherLen} in
	 * number.
	 *
	 * @param sentence    text to check; must not be {@code null}
	 * @param minlen      minimum accepted length
	 * @param maxlen      maximum accepted length
	 * @param maxOtherLen maximum number of leftover characters
	 * @return {@code true} when all checks pass
	 */
	public static boolean isGoodChiSentenceX(String sentence,int minlen,int maxlen,int maxOtherLen)
	{
		final int total = sentence.length();
		final boolean lengthOk = (total >= minlen) && (total <= maxlen);
		if (!lengthOk) {
			return false;
		}
		final int noise = sentence.replaceAll("[\\u4e00-\\u9fa5 ]", "").trim().length();
		final boolean tooNoisy = (noise * 1.0 / total > 0.3) || (noise > maxOtherLen);
		return !tooNoisy;
	}
	/**
	 * Language-neutral sentence heuristic: at least 10 characters long, and at
	 * most 40% of its length left over once ASCII letters, characters in
	 * U+4E00..U+9FA5 and spaces are removed (and the remainder trimmed).
	 *
	 * @param sentence text to check; must not be {@code null}
	 * @return {@code true} when the sentence passes both checks
	 */
	public static boolean isGoodSentence(String sentence)
	{
		final int total = sentence.length();
		if (total < 10) {
			return false;
		}
		final int noise = sentence.replaceAll("[a-zA-Z\\u4e00-\\u9fa5 ]","").trim().length();
		return !(noise * 1.0 / total > 0.4);
	}
	
	/**
	 * Sentence heuristic without a minimum length: accepts the text when at
	 * most 40% of its length is left over once ASCII letters, characters in
	 * U+4E00..U+9FA5 and spaces are removed (and the remainder trimmed).
	 * An empty string is accepted, because the ratio is then NaN and the
	 * comparison is false.
	 *
	 * @param sentence text to check; must not be {@code null}
	 * @return {@code true} unless the leftover ratio exceeds 0.4
	 */
	public static boolean isGoodSentence_simple(String sentence)
	{
		final int noise = sentence.replaceAll("[a-zA-Z\\u4e00-\\u9fa5 ]","").trim().length();
		final double ratio = noise * 1.0 / sentence.length();
		return !(ratio > 0.4);
	}

	/**
	 * Collects file paths below a directory.
	 * <p>
	 * When {@code directory} is a directory and {@code bIncludeSubDir} is
	 * {@code true}, every regular file in the whole tree is added (absolute
	 * paths); when {@code false}, only the regular files directly inside it.
	 * When {@code directory} is itself a regular file, it is added as given.
	 * Paths that do not exist, and directories whose content cannot be listed,
	 * contribute nothing.
	 *
	 * @param l              list to append to; a new list is created when {@code null}
	 * @param directory      directory (or file) to scan
	 * @param bIncludeSubDir whether to descend into sub-directories
	 * @return the list holding the collected paths ({@code l} when it was non-null)
	 */
	public static List<String> getFiles(List<String> l, String directory,boolean bIncludeSubDir) {
		if (l == null) {
			l = new ArrayList<String>();
		}

		File file = new File(directory);

		if (file.isDirectory()) {
			String[] children = file.list();
			if (children == null) {
				// File.list() returns null on an I/O error or missing permission.
				return l;
			}
			for (String child : children) {
				File childFile = new File(file, child);
				if (bIncludeSubDir) {
					getFiles(l, childFile.getAbsolutePath(), bIncludeSubDir);
				} else if (childFile.isFile()) {
					l.add(childFile.getAbsolutePath());
				}
			}
		} else if (file.isFile()) {
			l.add(directory);
		}

		return l;
	}
	
	
	/**
	 * Writes the path of every regular file below {@code directory} to
	 * {@code bw}, one per line.
	 * <p>
	 * Directories are traversed recursively. When {@code directory} is itself a
	 * regular file, its path is written as given. Traversal continues after a
	 * failure in a sub-directory, but the failure is reflected in the result.
	 *
	 * @param directory directory (or file) to scan
	 * @param bw        writer receiving the paths; it is neither flushed nor closed here
	 * @return {@code true} when everything was listed and written; {@code false}
	 *         when a directory could not be listed or writing failed anywhere
	 *         in the tree
	 */
	public static boolean getFiles(String directory,BufferedWriter bw) {
		try
		{
			File file = new File(directory);
			boolean ok = true;
			if (file.isDirectory()) {
				String[] children = file.list();
				if (children == null) {
					// File.list() returns null on an I/O error or missing permission.
					return false;
				}
				for (String child : children) {
					// Keep going after a failure, but remember it.
					ok &= getFiles(new File(file, child).getAbsolutePath(), bw);
				}
			} else if (file.isFile()) {
				bw.write(directory);
				bw.newLine();
			}
			return ok;
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
		return false;
	}
	
	/**
	 * Copies a file byte for byte.
	 * <p>
	 * The source is opened before the destination, so a missing source does not
	 * create or truncate {@code outFile}. Data is moved channel-to-channel, so
	 * the file is never mapped or loaded into memory as a whole, and both
	 * streams are closed on every path.
	 *
	 * @param inFile  path of the file to read
	 * @param outFile path of the file to create or overwrite
	 * @return {@code true} when the whole file was copied; {@code false} on any
	 *         failure (the exception is printed)
	 */
	public static boolean copyFile(String inFile,String outFile){
		FileInputStream input = null;
		FileOutputStream output = null;
		try
		{
			input = new FileInputStream(new File(inFile));
			output = new FileOutputStream(new File(outFile));
			FileChannel src = input.getChannel();
			FileChannel dst = output.getChannel();

			long size = src.size();
			long position = 0;
			// transferTo may move fewer bytes than requested, hence the loop.
			while (position < size) {
				long moved = src.transferTo(position, size - position, dst);
				if (moved <= 0) {
					break;
				}
				position += moved;
			}
			return position == size;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return false;
		}
		finally
		{
			if (output != null) {
				try {
					output.close();
				} catch (IOException ignored) {
					// nothing useful can be done if close fails
				}
			}
			if (input != null) {
				try {
					input.close();
				} catch (IOException ignored) {
					// nothing useful can be done if close fails
				}
			}
		}
	}
	
	/**
	 * Tells whether the text contains at least one character matched by
	 * {@code patChi} (range U+4E00..U+9FA5).
	 *
	 * @param text text to inspect; must not be {@code null}
	 * @return {@code true} when such a character occurs anywhere in {@code text}
	 */
	public static boolean isContainedChiWord(String text)
	{
		return patChi.matcher(text).find();
	}
	
	/**
	 * Tells whether the text contains at least one character matched by
	 * {@code patWord} (an ASCII letter or a character in U+4E00..U+9FA5).
	 *
	 * @param text text to inspect; must not be {@code null}
	 * @return {@code true} when such a character occurs anywhere in {@code text}
	 */
	public static boolean isContainedWord(String text)
	{
		return patWord.matcher(text).find();
	}
	
	/**
	 * Tells whether the text consists solely of ASCII letters and characters in
	 * U+4E00..U+9FA5, i.e. {@code patUnWord} finds nothing in it. An empty
	 * string therefore yields {@code true}.
	 *
	 * @param text text to inspect; must not be {@code null}
	 * @return {@code true} when no other character occurs in {@code text}
	 */
	public static boolean isContainedOnlyWord(String text)
	{
		final boolean hasForeignChar = patUnWord.matcher(text).find();
		return !hasForeignChar;
	}
	
	/**
	 * Maps a word to its stem using the table in {@code data/model_eng.txt}.
	 * <p>
	 * The table is loaded on first use: each line is split at its first run of
	 * whitespace into {@code word} and {@code stem}. It is built completely
	 * before being stored in {@code stemMap}, so a half-filled table is never
	 * visible. When loading fails, a message is printed, the trimmed word is
	 * returned and the next call tries to load again.
	 *
	 * @param word word to stem; must not be {@code null}
	 * @return the stem for the trimmed word, or the trimmed word itself when it
	 *         is not in the table or the table cannot be loaded
	 */
	public static String stemming(String word)
	{
		final String key = word.trim();
		if(Utility.stemMap == null)
		{
			HashMap<String,String> loaded = new HashMap<String,String>();
			BufferedReader br = null;
			try
			{
				Pattern p = Pattern.compile("(.+?)[\\s]+(.+)");
				br = new BufferedReader(new InputStreamReader(
						new FileInputStream("data/model_eng.txt"),"UTF-8"));
				String temp;
				while((temp = br.readLine()) != null)
				{
					Matcher m = p.matcher(temp);
					if(m.find())
					{
						loaded.put(m.group(1), m.group(2));
					}
				}
			}
			catch(Exception e)
			{
				System.out.println("Initialize stemming failed!");
				return key;
			}
			finally
			{
				if(br != null)
				{
					try
					{
						br.close();
					}
					catch(IOException ignored)
					{
						// nothing useful can be done if close fails
					}
				}
			}
			// Publish only the fully populated table.
			Utility.stemMap = loaded;
		}
		String stem = stemMap.get(key);
		return (stem != null) ? stem : key;
	}
	
	/**
	 * Stems every space-separated token of a text with {@code stemming}.
	 * <p>
	 * The stems are joined with single spaces and, as before, the result ends
	 * with a trailing space after the last stem.
	 *
	 * @param text text to stem; must not be {@code null}
	 * @return the stemmed tokens, each followed by one space
	 */
	public static String stemmingText(String text)
	{
		String[] words = text.split(" ");
		// StringBuilder avoids re-copying the whole result for every token.
		StringBuilder result = new StringBuilder(text.length() + words.length);
		for(String word : words)
		{
			result.append(stemming(word)).append(' ');
		}
		return result.toString();
	}
	
	
	  /**
	   * Returns a copy of the map whose iteration order follows the values in
	   * descending order. Entries with a {@code null} value come last; entries
	   * with equal values keep the relative order of the source map's entry set.
	   *
	   * @param map map to sort; it is not modified
	   * @return a new {@code LinkedHashMap} with the entries in sorted order
	   */
	  public static <K, V extends Comparable<V>> Map<K, V> sortByValueDesc(Map<K, V> map) 
	  {
		List<Entry<K, V>> entries = new ArrayList<Entry<K, V>>(map.entrySet());
		Collections.sort(entries, new Comparator<Entry<K, V>>() {
			public int compare(Entry<K, V> left, Entry<K, V> right) {
				V leftValue = left.getValue();
				V rightValue = right.getValue();
				if (leftValue == null && rightValue == null) {
					return 0;
				}
				if (rightValue == null) {
					return -1;   // non-null before null
				}
				if (leftValue == null) {
					return 1;    // null after non-null
				}
				// Reversed operands give descending order.
				return rightValue.compareTo(leftValue);
			}
		});

		Map<K, V> ordered = new LinkedHashMap<K, V>();
		for (Entry<K, V> entry : entries) {
			ordered.put(entry.getKey(), entry.getValue());
		}
		return ordered;
	  }
	
	/**
	 * Extracts the plain text from an HTML string, taking care of line breaks.
	 * <p>
	 * Steps: remove CR/LF from the markup, strip script/style and similar code,
	 * turn the remaining tags into text, decode character references, then
	 * normalise whitespace (indentation runs become line breaks, repeated
	 * spaces collapse, no-break and ideographic spaces are removed).
	 *
	 * @param htmlText HTML to convert
	 * @return the extracted text
	 */
	public static String TransferHTML2Text(String htmlText)
	{
		final String singleLine = Utility.RemoveHTMLReturnCode(htmlText);
		final String withoutCode = Utility.RemoveStyleCode(singleLine);
		final String withoutTags = Utility.RemoveHTMLCode(withoutCode);
		final String decoded = Utility.HTMLDecode(withoutTags);

		return decoded
				.replaceAll(" 　　", "\r\n")       // space + two wide spaces -> new line
				.replaceAll(" +\r\n", "\r\n")     // drop spaces before a line break
				.replaceAll(" +", " ")            // collapse runs of spaces
				.replaceAll("[\\u00A0\\u3000]", "")
				.replaceAll("　", "");
	}
	
	/**
	 * Removes markup that carries no article content, images and links
	 * included.
	 * <p>
	 * In order: script/style code is stripped, {@code &nbsp;} becomes a space,
	 * hidden divs and form controls are removed, all tag attributes are
	 * dropped, conditional comments and legends are removed, anchors are turned
	 * into {@code <span>} elements, iframes/noscript/objects/images/center and
	 * cufon tags are removed, and finally empty ul/div/p/li/canvas elements are
	 * deleted.
	 * <p>
	 * Anchors were previously rewritten to the misspelled {@code <sapn>} tag;
	 * this now emits {@code <span>}.
	 * NOTE(review): confirm that nothing downstream matches on the old
	 * {@code sapn} spelling.
	 *
	 * @param htmlText HTML to clean
	 * @return the cleaned HTML; when an exception occurs it is printed and the
	 *         text as it stood after the last successful step is returned
	 */
	public static String RemoveUselessHTMLTag(String htmlText)
	{
		try
		{
			htmlText = Utility.RemoveStyleCode(htmlText);

			htmlText = htmlText.replaceAll("&nbsp;"," ");

			// Elements hidden with display:none.
			htmlText = divNoneP.matcher(htmlText).replaceAll("");

			// Form controls.
			htmlText = textareaRemoveP.matcher(htmlText).replaceAll("");
			htmlText = selectRemoveP.matcher(htmlText).replaceAll("");
			htmlText = optionRemoveP.matcher(htmlText).replaceAll("");
			htmlText = labelRemoveP.matcher(htmlText).replaceAll("");
			htmlText = inputP.matcher(htmlText).replaceAll("");
			htmlText = formP.matcher(htmlText).replaceAll("");
			htmlText = buttonP.matcher(htmlText).replaceAll("");
			htmlText = formRP.matcher(htmlText).replaceAll("");

			// Drop the attributes of every opening tag, keeping only its name.
			htmlText = removeAttrP.matcher(htmlText).replaceAll("<$1>");
			htmlText = commentP.matcher(htmlText).replaceAll("");
			htmlText = legendRemoveP.matcher(htmlText).replaceAll("");

			// Keep the link text but neutralise the link itself.
			htmlText = aP.matcher(htmlText).replaceAll("<span>");
			htmlText = aRP.matcher(htmlText).replaceAll("</span>");
			htmlText = iframeP.matcher(htmlText).replaceAll("");
			htmlText = noscriptP.matcher(htmlText).replaceAll("");
			htmlText = objectP.matcher(htmlText).replaceAll("");
			htmlText = imgP.matcher(htmlText).replaceAll("");
			htmlText = imgRP.matcher(htmlText).replaceAll("");

			htmlText = centerP.matcher(htmlText).replaceAll("");
			htmlText = centerRP.matcher(htmlText).replaceAll("");

			htmlText = htmlText.replaceAll("<cufontext>", "");
			htmlText = htmlText.replaceAll("</cufontext>", "");
			htmlText = htmlText.replaceAll("<cufon>", "");
			htmlText = htmlText.replaceAll("</cufon>", "");

			// Elements left empty by the steps above.
			htmlText = htmlText.replaceAll("(?s)<ul[^>]*>\\s*</ul>", "");
			htmlText = htmlText.replaceAll("(?s)<div[^>]*>\\s*</div>", "");
			htmlText = htmlText.replaceAll("(?s)<p[^>]*>\\s*</p>", "");
			htmlText = htmlText.replaceAll("(?s)<li[^>]*>\\s*</li>", "");
			htmlText = htmlText.replaceAll("(?s)<canvas[^>]*>\\s*</canvas>", "");

			return htmlText;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return htmlText;	
		}
	}
	
	
	/**
	 * Strips markup that carries no readable content from an HTML fragment while
	 * leaving {@code <img>} tags in place.
	 *
	 * <p>The steps run in a fixed order, each on the output of the previous one:
	 * <ol>
	 *   <li>{@code Utility.RemoveStyleCode} is applied and {@code &nbsp;} becomes a space;</li>
	 *   <li>matches of the hidden-div and form-control patterns are deleted;</li>
	 *   <li>{@code removeAttrP} rewrites tags to {@code <$1>}; it is bracketed by
	 *       {@code imgReplaceP}/{@code imgRevReplaceP}, which rename {@code img} to
	 *       {@code _img} and back (presumably so image attributes survive - the patterns
	 *       are declared elsewhere, confirm there);</li>
	 *   <li>comments, legends, iframes, noscript, object and center tags are deleted and
	 *       anchor tags are rewritten;</li>
	 *   <li>cufon tags and empty ul/div/p/li/canvas elements are deleted.</li>
	 * </ol>
	 *
	 * <p>NOTE(review): anchors are rewritten to the literal {@code <sapn>} / {@code </sapn>},
	 * which looks like a misspelling of "span". It is kept as-is because other code may
	 * rely on that exact text.
	 *
	 * @param htmlText HTML fragment to clean
	 * @return the cleaned fragment; if any step throws, the stack trace is printed and the
	 *         text as transformed up to the failing step is returned
	 */
	public static String RemoveUselessHTMLTagX(String htmlText)
	{
		try
		{
			htmlText = Utility.RemoveStyleCode(htmlText);
			htmlText = htmlText.replaceAll("&nbsp;"," ");

			// Stage 1: hidden blocks and form controls are dropped entirely.
			Pattern[] hiddenAndFormPatterns = {
				divNoneP, textareaRemoveP, selectRemoveP, optionRemoveP,
				labelRemoveP, inputP, formP, buttonP, formRP
			};
			for(Pattern dropPattern : hiddenAndFormPatterns)
			{
				htmlText = dropPattern.matcher(htmlText).replaceAll("");
			}

			// Stage 2: attribute removal, with img tags renamed before and restored after.
			htmlText = imgReplaceP.matcher(htmlText).replaceAll("<_img$1>");
			htmlText = removeAttrP.matcher(htmlText).replaceAll("<$1>");
			htmlText = imgRevReplaceP.matcher(htmlText).replaceAll("<img$1>");

			// Stage 3: comments and legends go away, anchors are rewritten.
			Pattern[] commentAndLegendPatterns = { commentP, legendRemoveP };
			for(Pattern dropPattern : commentAndLegendPatterns)
			{
				htmlText = dropPattern.matcher(htmlText).replaceAll("");
			}
			htmlText = aP.matcher(htmlText).replaceAll("<sapn>");
			htmlText = aRP.matcher(htmlText).replaceAll("</sapn>");

			// Stage 4: embedded content and layout-only tags. img tags are deliberately kept.
			Pattern[] embeddedAndLayoutPatterns = { iframeP, noscriptP, objectP, centerP, centerRP };
			for(Pattern dropPattern : embeddedAndLayoutPatterns)
			{
				htmlText = dropPattern.matcher(htmlText).replaceAll("");
			}

			// Stage 5: cufon wrappers first, then containers left empty by the earlier stages.
			String[] trailingRegexes = {
				"<cufontext>",
				"</cufontext>",
				"<cufon>",
				"</cufon>",
				"(?s)<ul[^>]*>\\s*</ul>",
				"(?s)<div[^>]*>\\s*</div>",
				"(?s)<p[^>]*>\\s*</p>",
				"(?s)<li[^>]*>\\s*</li>",
				"(?s)<canvas[^>]*>\\s*</canvas>"
			};
			for(String trailingRegex : trailingRegexes)
			{
				htmlText = htmlText.replaceAll(trailingRegex, "");
			}

			return htmlText;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return htmlText;
		}
	}
	
	
	
	
	/**
	 * Removes anchors (and their containers) that contribute nothing besides link text.
	 *
	 * <p>The input is parsed with Jsoup and every {@code <a>} element is examined. If no
	 * anchor is found the input string is returned unchanged; otherwise the outer HTML of
	 * the parsed document is returned.
	 *
	 * @param contentWithTag HTML to clean
	 * @return the original string when it contains no anchors, else {@code doc.outerHtml()}
	 *         after the removals described in the body
	 */
	public static String RemoveUselessLink(String contentWithTag)
	{
		Document doc =  Jsoup.parse(contentWithTag);
		Elements contentElems = doc.select("a");
		if((contentElems == null) || (contentElems.size() == 0))
		{
			return contentWithTag;
		}
		for(Element aElement : contentElems)
		{
			try
			{
				String elementText = aElement.text().trim();
				Element parentElement = aElement.parent();
				String parentText = parentElement.text().trim();
				// Ideographic spaces (U+3000) are dropped before the two texts are compared.
				elementText = elementText.replaceAll("　", "").trim();
				parentText = parentText.replaceAll("　", "").trim();		
				if(parentText.equals(elementText))
				{
					// The parent's text consists of this anchor's text only: drop the anchor.
					aElement.remove();
				}
				else 
				{
					// Strip aRemoveP matches from the parent's HTML and convert the rest to text
					// (presumably this removes the anchors and tags - both helpers are declared
					// elsewhere, confirm there). If no digit, Latin letter or CJK character
					// remains, the whole parent is removed.
					parentText = Utility.aRemoveP.matcher(parentElement.html()).replaceAll("");
					parentText = Utility.TransferHTML2Text(parentText);
					if(Utility.patWordAndNum.matcher(parentText).find() == false)
					{
						parentElement.remove();
					}
				}
				// Walk upwards removing every ancestor whose text is now empty.
				// NOTE(review): there is no null check here; if parent() yields null the next
				// text() call throws and the catch below ends processing of this anchor.
				while(parentElement.text().trim().isEmpty())
				{
					Element tempElement = parentElement;
					parentElement = parentElement.parent();
					tempElement.remove();
				}
			}
			catch(Exception e)
			{
				// Any failure on one anchor is ignored and the next anchor is processed.
				continue;
			}
		}
		
		
		
		return doc.outerHtml();
	
	}
	
	/**
	 * Removes every {@code <a>} element from the given HTML, then removes each ancestor
	 * that is left without any text.
	 *
	 * <p>The upward walk stops at the first ancestor that still has text, or at the root
	 * node (which has no parent and therefore cannot be detached). The walk is bounded by
	 * an explicit null check rather than by an exception being thrown and swallowed.
	 *
	 * @param contentWithTag HTML to clean; may be {@code null}
	 * @return {@code null} for {@code null} input; the original string when it contains no
	 *         anchors; otherwise the outer HTML of the parsed document after the removals
	 */
	public static String RemoveAllLink(String contentWithTag)
	{
		if(contentWithTag == null)
		{
			return null;
		}
		Document doc = Jsoup.parse(contentWithTag);
		Elements contentElems = doc.select("a");
		if(contentElems.isEmpty())
		{
			return contentWithTag;
		}
		for(Element aElement : contentElems)
		{
			Element ancestor = aElement.parent();
			if(ancestor == null)
			{
				// Already detached; nothing to remove it from.
				continue;
			}
			aElement.remove();
			// Only nodes that have a parent can be detached, so the root is never removed.
			while((ancestor != null) && (ancestor.parent() != null) && ancestor.text().trim().isEmpty())
			{
				Element emptied = ancestor;
				ancestor = ancestor.parent();
				emptied.remove();
			}
		}
		return doc.outerHtml();
	}
	
	
	
	/**
	 * Reports whether any of the date patterns {@code patDate0} .. {@code patDate6}
	 * matches somewhere in the text. The matched text is not validated.
	 *
	 * @param content text to scan
	 * @return {@code true} if at least one pattern finds a match; {@code false} otherwise,
	 *         including when an exception occurs (its stack trace is printed)
	 */
	public static boolean ContainDateInfo_BAK(String content)
	{
		try
		{
			// Patterns are tried in index order and the scan stops at the first hit.
			Pattern[] datePatterns = {
				patDate0, patDate1, patDate2, patDate3, patDate4, patDate5, patDate6
			};
			for(Pattern datePattern : datePatterns)
			{
				if(datePattern.matcher(content).find())
				{
					return true;
				}
			}
			return false;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return false;
		}
	}
	
	/**
	 * Reports whether the text contains a date that both matches one of the patterns
	 * {@code patDate0} .. {@code patDate6} and is accepted by
	 * {@link #transDate(String, int)} for that pattern's index.
	 *
	 * <p>For each pattern only its first match in the text is considered.
	 *
	 * @param content text to scan
	 * @return {@code true} if some pattern's first match converts to a non-null date;
	 *         {@code false} otherwise, including when an exception occurs (its stack
	 *         trace is printed)
	 */
	public static boolean ContainDateInfo(String content)
	{
		try
		{
			Pattern[] datePatterns = {
				patDate0, patDate1, patDate2, patDate3, patDate4, patDate5, patDate6
			};
			// The array index doubles as the "type" argument expected by transDate.
			for(int type = 0; type < datePatterns.length; type++)
			{
				Matcher candidate = datePatterns[type].matcher(content);
				if(candidate.find() && (Utility.transDate(candidate.group(), type) != null))
				{
					return true;
				}
			}
			return false;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return false;
		}
	}

	
	
	/**
	 * Returns the matcher of the first date pattern ({@code patDate0} .. {@code patDate6},
	 * tried in that order) that finds a match in the text. The matched text is not validated.
	 *
	 * @param content text to scan
	 * @return the matcher positioned on its first match, or {@code null} when no pattern
	 *         matches or an exception occurs (its stack trace is printed)
	 */
	public static Matcher ContainedDateInfo_BAK(String content)
	{
		try
		{
			Pattern[] datePatterns = {
				patDate0, patDate1, patDate2, patDate3, patDate4, patDate5, patDate6
			};
			for(Pattern datePattern : datePatterns)
			{
				Matcher candidate = datePattern.matcher(content);
				if(candidate.find())
				{
					return candidate;
				}
			}
			return null;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return null;
		}
	}
	
	
	/**
	 * Returns the matcher of the first date pattern ({@code patDate0} .. {@code patDate6},
	 * tried in that order) whose first match in the text is accepted by
	 * {@link #transDate(String, int)} for that pattern's index.
	 *
	 * @param content text to scan
	 * @return the matcher positioned on the accepted match, or {@code null} when no
	 *         pattern yields an accepted date or an exception occurs (its stack trace
	 *         is printed)
	 */
	public static Matcher ContainedDateInfo(String content)
	{
		try
		{
			Pattern[] datePatterns = {
				patDate0, patDate1, patDate2, patDate3, patDate4, patDate5, patDate6
			};
			// The array index doubles as the "type" argument expected by transDate.
			for(int type = 0; type < datePatterns.length; type++)
			{
				Matcher candidate = datePatterns[type].matcher(content);
				if(!candidate.find())
				{
					continue;
				}
				if(Utility.transDate(candidate.group(), type) != null)
				{
					return candidate;
				}
			}
			return null;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return null;
		}
	}
	
	/**
	 * Parses a date string with the formatter(s) associated with the given type index.
	 *
	 * <p>{@code thresholdDate} is initialised on first use by parsing "1970-01-01" with
	 * {@code formatter0}; any result earlier than it is rejected.
	 *
	 * <ul>
	 *   <li>type 0: {@code formatter0}, falling back to {@code formatter0_1} when the
	 *       result is before the threshold;</li>
	 *   <li>type 1: parsing is disabled, so the result is always {@code null};</li>
	 *   <li>types 2, 4, 6: a single formatter each;</li>
	 *   <li>type 3: {@code formatter3_1}, falling back to {@code formatter3_2} when the
	 *       first one throws;</li>
	 *   <li>type 5: {@code formatter5_1} (failure tolerated), then {@code formatter5_2},
	 *       {@code formatter5_3}, {@code formatter5_4} in turn while the result is missing
	 *       or before the threshold. A failure of any of those three aborts with
	 *       {@code null}.</li>
	 *   <li>any other type: {@code null}.</li>
	 * </ul>
	 *
	 * <p>NOTE(review): the formatters and {@code thresholdDate} are shared fields declared
	 * elsewhere; if they are {@code SimpleDateFormat} instances this method is not safe
	 * for concurrent use - confirm.
	 *
	 * @param source text of the date
	 * @param type   index of the date pattern that produced {@code source}
	 * @return the parsed date, or {@code null} when parsing fails, the type is not
	 *         handled, or the date is before the threshold
	 */
	public static Date transDate(String source,int type)
	{
		try
		{
			if(thresholdDate == null)
			{
				thresholdDate = formatter0.parse("1970-01-01");
			}

			Date parsed = null;
			if(type == 0)
			{
				parsed = formatter0.parse(source);
				if(parsed.before(thresholdDate))
				{
					// Second chance with the alternative formatter; the final threshold
					// check below rejects the value if it is still too early.
					parsed = formatter0_1.parse(source);
				}
			}
			else if(type == 2)
			{
				parsed = formatter2.parse(source);
			}
			else if(type == 3)
			{
				try
				{
					parsed = formatter3_1.parse(source);
				}
				catch(Exception firstFormatFailed)
				{
					parsed = null;
				}
				if(parsed == null)
				{
					parsed = formatter3_2.parse(source);
				}
			}
			else if(type == 4)
			{
				parsed = formatter4.parse(source);
			}
			else if(type == 5)
			{
				try
				{
					parsed = formatter5_1.parse(source);
				}
				catch(Exception firstFormatFailed)
				{
					parsed = null;
				}
				if((parsed == null) || parsed.before(thresholdDate))
				{
					parsed = formatter5_2.parse(source);
				}
				if((parsed == null) || parsed.before(thresholdDate))
				{
					parsed = formatter5_3.parse(source);
				}
				if((parsed == null) || parsed.before(thresholdDate))
				{
					parsed = formatter5_4.parse(source);
				}
			}
			else if(type == 6)
			{
				parsed = formatter6.parse(source);
			}
			// type 1 and unknown types fall through with parsed == null.

			boolean tooEarly = (parsed != null) && parsed.before(thresholdDate);
			return tooEarly ? null : parsed;
		}
		catch(Exception e)
		{
			return null;
		}
	}
	
	
	/**
	 * Extracts the first acceptable date from free text.
	 *
	 * <p>The patterns {@code patDate0} .. {@code patDate6} are tried in order; for each,
	 * only its first match is passed to {@link #transDate(String, int)} with the pattern's
	 * index as type. The first non-null result wins.
	 *
	 * @param content text to scan
	 * @return the extracted date, or {@code null} when none is found or an exception
	 *         occurs (its stack trace is printed)
	 */
	public static Date transDate(String content)
	{
		try
		{
			Pattern[] datePatterns = {
				patDate0, patDate1, patDate2, patDate3, patDate4, patDate5, patDate6
			};
			for(int type = 0; type < datePatterns.length; type++)
			{
				Matcher candidate = datePatterns[type].matcher(content);
				if(!candidate.find())
				{
					continue;
				}
				Date parsed = Utility.transDate(candidate.group(), type);
				if(parsed != null)
				{
					return parsed;
				}
			}
			return null;
		}
		catch(Exception e)
		{
			e.printStackTrace();
			return null;
		}
	}
	
	/**
	 * Extracts a date from free text via {@link #transDate(String)} and renders it with
	 * {@code formatter0}.
	 *
	 * @param content text to scan
	 * @return the formatted date, or {@code null} when no date is found or any step throws
	 */
	public static String transStandardDate(String content)
	{
		try
		{
			Date extracted = transDate(content);
			if(extracted == null)
			{
				return null;
			}
			return formatter0.format(extracted);
		}
		catch(Exception e)
		{
			return null;
		}
	}

	/**
	 * Returns the parent portion of a slash-separated path: everything before the last
	 * {@code '/'}.
	 *
	 * <p>A path without any {@code '/'} has no parent portion and yields an empty string
	 * (previously this case threw {@code StringIndexOutOfBoundsException}).
	 *
	 * @param path slash-separated path or URL; may be {@code null}
	 * @return the text before the last {@code '/'}; {@code ""} when there is no
	 *         {@code '/'}; {@code null} for {@code null} input
	 */
	public static String getDirPath(String path) {
		if (path == null) {
			return null;
		}
		int lastSlash = path.lastIndexOf('/');
		if (lastSlash < 0) {
			return "";
		}
		return path.substring(0, lastSlash);
	}
	/**
	 * Punctuation removed by {@link #StringFilter(String)}: a fixed set of ASCII symbols
	 * followed by full-width / CJK punctuation (U+FF01, U+FFE5, U+2026, U+FF08, U+FF09,
	 * U+2014, U+3010, U+3011, U+2018, U+2019, U+201C, U+201D, U+FF1B, U+FF1A, U+3002,
	 * U+FF0C, U+3001, U+FF1F). The non-ASCII members are written as Unicode escapes so the
	 * pattern does not depend on the source file encoding. Square brackets are escaped
	 * properly; the earlier spelling formed a nested character class and never matched them.
	 */
	private static final Pattern SPECIAL_CHARS_PATTERN = Pattern.compile(
			"[?`~!@#$%^&*()+=|{}':;,/\\[\\].<>"
			+ "\uFF01\uFFE5\u2026\uFF08\uFF09\u2014\u3010\u3011"
			+ "\u2018\u2019\u201C\u201D\uFF1B\uFF1A\u3002\uFF0C\u3001\uFF1F]");

	/**
	 * Removes special punctuation characters from a string and trims the result.
	 *
	 * <p>Letters, digits, whitespace inside the text, hyphen, underscore, double quote and
	 * backslash are left untouched.
	 *
	 * @param str text to clean; may be {@code null}
	 * @return the text without the special characters, trimmed; {@code null} for
	 *         {@code null} input
	 * @throws PatternSyntaxException never in practice; kept for signature compatibility
	 */
	public static String StringFilter(String str) throws PatternSyntaxException {
		if (str == null) {
			return null;
		}
		return SPECIAL_CHARS_PATTERN.matcher(str).replaceAll("").trim();
	}

    /**
     * Round-trips a string through two charsets: the text is encoded and decoded with
     * {@code sourceCharset}, then encoded and decoded with {@code targetCharset}.
     *
     * <p>Characters that either charset cannot represent come back as that charset's
     * replacement (for example {@code '?'}); text representable in both is returned
     * unchanged.
     *
     * @param content       text to convert
     * @param sourceCharset name of the first charset
     * @param targetCharset name of the second charset
     * @return the text after both round trips
     * @throws UnsupportedEncodingException if either charset name is not supported
     */
    public static String convertCharset(String content,String sourceCharset,String targetCharset) throws UnsupportedEncodingException{
        byte[] sourceBytes = content.getBytes(sourceCharset);
        String afterSource = new String(sourceBytes, sourceCharset);
        byte[] targetBytes = afterSource.getBytes(targetCharset);
        return new String(targetBytes, targetCharset);
    }

	/**
	 * Returns the lower-cased file extension of an address when it is one of the
	 * recognised document types (.pdf, .doc, .docx, .ppt, .pptx, .xls, .xlsx).
	 *
	 * <p>The extension is everything from the last {@code '.'} to the end of the string,
	 * so an address with trailing text after the extension (such as a query string) is
	 * not recognised.
	 *
	 * @param sourceaddress file name, path or URL
	 * @return the extension including the leading dot, in lower case; {@code null} when
	 *         there is no dot or the extension is not a recognised document type
	 */
	public static String getFileSuffix(String sourceaddress) {
		int lastDot = sourceaddress.lastIndexOf(".");
		if (lastDot == -1) {
			return null;
		}
		String loweredSuffix = sourceaddress.substring(lastDot).toLowerCase();
		boolean isDocument = Arrays.asList(
				".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx").contains(loweredSuffix);
		return isDocument ? loweredSuffix : null;
	}
	
    /** Charset returned when the input names no known charset. */
	private static final String DEFAULT_CHARSET_NAME = "GB2312";

	/** Known charset names, in priority order; the spelling here is what callers get back. */
	private static final String[] KNOWN_CHARSET_NAMES = {"GB2312","GBK","UTF-8","ISO-8859-1",
			"UTF-16","UTF-16BE","UTF-16LE","UTF-32","UTF-32BE",
			"UTF-32LE","ISO-2022-CN","ISO-2022-JP","ISO-2022-JP-2",
			"ISO-2022-KR","ISO-8859-13","ISO-8859-15","ISO-8859-2",
			"ISO-8859-3","ISO-8859-4","ISO-8859-5","ISO-8859-6",
			"ISO-8859-7","ISO-8859-8","ISO-8859-9","Big5","Big5-HKSCS",
			"EUC-JP","EUC-KR","GB18030","IBM-Thai","IBM00858","IBM01140",
			"IBM01141","IBM01142","IBM01143","IBM01144","IBM01145",
			"IBM01146","IBM01147","IBM01148","IBM01149","IBM037","IBM1026",
			"IBM1047","IBM273","IBM277","IBM278","IBM280","IBM284","IBM285",
			"IBM290","IBM297","IBM420","IBM424","IBM437","IBM500","IBM775",
			"IBM850","IBM852","IBM855","IBM857","IBM860","IBM861","IBM862",
			"IBM863","IBM864","IBM865","IBM866","IBM868","IBM869",
			"IBM870","IBM871","IBM918","JIS_X0201","JIS_X0212-1990",
			"KOI8-R","KOI8-U","PDFDocEncoding","Shift_JIS","TIS-620",
			"US-ASCII","UTF-16","UTF-16BE","UTF-16LE","UTF-32","UTF-32BE",
			"UTF-32LE","UTF-8","windows-1250","windows-1251","windows-1252",
			"windows-1253","windows-1254","windows-1255","windows-1256",
			"windows-1257","windows-1258","windows-31j","x-Big5-HKSCS-2001",
			"x-Big5-Solaris","x-euc-jp-linux","x-EUC-TW","x-eucJP-Open","x-IBM1006",
			"x-IBM1025","x-IBM1046","x-IBM1097","x-IBM1098","x-IBM1112","x-IBM1122",
			"x-IBM1123","x-IBM1124","x-IBM1364","x-IBM1381","x-IBM1383","x-IBM300",
			"x-IBM33722","x-IBM737","x-IBM833","x-IBM834","x-IBM856","x-IBM874",
			"x-IBM875","x-IBM921","x-IBM922","x-IBM930","x-IBM933","x-IBM935",
			"x-IBM937","x-IBM939","x-IBM942","x-IBM942C","x-IBM943","x-IBM943C",
			"x-IBM948","x-IBM949","x-IBM949C","x-IBM950","x-IBM964","x-IBM970",
			"x-ISCII91","x-ISO-2022-CN-CNS","x-ISO-2022-CN-GB","x-iso-8859-11",
			"x-JIS0208","x-JISAutoDetect","x-Johab","x-MacArabic","x-MacCentralEurope",
			"x-MacCroatian","x-MacCyrillic","x-MacDingbat","x-MacGreek","x-MacHebrew",
			"x-MacIceland","x-MacRoman","x-MacRomania","x-MacSymbol","x-MacThai",
			"x-MacTurkish","x-MacUkraine","x-MS932_0213","x-MS950-HKSCS","x-MS950-HKSCS-XP",
			"x-mswin-936","x-PCK","x-SJIS_0213","x-UTF-16LE-BOM","X-UTF-32BE-BOM",
			"X-UTF-32LE-BOM","x-windows-50220","x-windows-50221","x-windows-874",
			"x-windows-949","x-windows-950","x-windows-iso2022jp"
	};

	/**
	 * Maps free text that mentions a charset (for example a Content-Type header value)
	 * to the canonical spelling of a known charset name.
	 *
	 * <p>Matching is a case-insensitive substring search over the known names in priority
	 * order. The first name found is taken, and is then replaced by any later, longer
	 * known name that contains it and also occurs in the input. This keeps the priority
	 * order between unrelated charsets while ensuring that, for example,
	 * {@code "ISO-8859-15"} is not reported as {@code "ISO-8859-1"} and {@code "UTF-16LE"}
	 * is not reported as {@code "UTF-16"}.
	 *
	 * <p>Case folding uses {@code Locale.ROOT} so the result does not depend on the
	 * default locale.
	 *
	 * @param charset text that may contain a charset name; may be {@code null}
	 * @return the matching known name, or {@code "GB2312"} when the input is {@code null}
	 *         or names no known charset
	 */
	public static String charsetcheck(String charset) {
		if (charset == null) {
			return DEFAULT_CHARSET_NAME;
		}
		String haystack = charset.toLowerCase(Locale.ROOT);
		String best = null;
		String bestLower = null;
		for (String candidate : KNOWN_CHARSET_NAMES) {
			String candidateLower = candidate.toLowerCase(Locale.ROOT);
			if (!haystack.contains(candidateLower)) {
				continue;
			}
			// Accept the first hit, then only refinements: longer names extending the current one.
			boolean refinesBest = (best != null)
					&& (candidateLower.length() > bestLower.length())
					&& candidateLower.contains(bestLower);
			if ((best == null) || refinesBest) {
				best = candidate;
				bestLower = candidateLower;
			}
		}
		return (best == null) ? DEFAULT_CHARSET_NAME : best;
	}
	
	   
	   /** Matches one or more consecutive characters in the CJK range U+4E00..U+9FA5. */
	private static final Pattern CJK_RUN_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]+");

	/**
	 * Makes a URI safe to request by percent-encoding its query parameters and any CJK
	 * characters left elsewhere in it.
	 *
	 * <ol>
	 *   <li>If the URI contains a {@code '?'} and does not end with one, everything after
	 *       the <em>first</em> {@code '?'} is treated as the query string. It is split on
	 *       {@code '&'}; each name and value is encoded with {@code URLEncoder} (UTF-8),
	 *       and {@code %3D} inside a value is turned back into {@code '='}. A later
	 *       {@code '?'} is part of the parameter text and gets encoded rather than
	 *       truncating the query.</li>
	 *   <li>Every remaining run of characters in U+4E00..U+9FA5 is encoded in a single
	 *       pass, so a short run never corrupts a longer run that contains it.</li>
	 *   <li>Trailing slashes are collapsed to one and spaces become {@code %20}.</li>
	 * </ol>
	 *
	 * <p>Values are always encoded, so text that is already percent-encoded will be
	 * encoded a second time.
	 *
	 * @param uri the URI to encode; may be {@code null}
	 * @return the encoded URI, or {@code null} for {@code null} input
	 */
	public static String encodURI(String uri) {
		if (uri == null) {
			return null;
		}

		if (uri.contains("?") && !uri.endsWith("?")) {
			int queryStart = uri.indexOf('?');
			String uriPart = uri.substring(0, queryStart);
			String[] params = uri.substring(queryStart + 1).split("&");

			StringBuilder query = new StringBuilder("?");
			for (int i = 0; i < params.length; i++) {
				if (i > 0) {
					query.append('&');
				}
				String param = params[i];
				int equalsAt = param.indexOf('=');
				if (equalsAt != -1) {
					String name = param.substring(0, equalsAt);
					// '=' inside a value is legal in practice, so it is restored after encoding.
					String value = encodeUtf8Component(param.substring(equalsAt + 1)).replace("%3D", "=");
					query.append(encodeUtf8Component(name)).append('=').append(value);
				} else {
					query.append(encodeUtf8Component(param));
				}
			}
			uri = uriPart + query;
		}

		// Encode CJK text outside the query (the query no longer contains any at this point).
		Matcher cjkRuns = CJK_RUN_PATTERN.matcher(uri);
		StringBuffer encoded = new StringBuffer();
		while (cjkRuns.find()) {
			cjkRuns.appendReplacement(encoded,
					Matcher.quoteReplacement(encodeUtf8Component(cjkRuns.group())));
		}
		cjkRuns.appendTail(encoded);
		uri = encoded.toString();

		uri = uri.replaceAll("/+$", "/");
		uri = uri.replace(" ", "%20");
		return uri;
	}

	/** URL-encodes text as UTF-8, turning the impossible "UTF-8 unsupported" case into an unchecked error. */
	private static String encodeUtf8Component(String text) {
		try {
			return URLEncoder.encode(text, "UTF-8");
		} catch (UnsupportedEncodingException e) {
			// Every Java platform is required to support UTF-8.
			throw new IllegalStateException("UTF-8 encoding is not available", e);
		}
	}
 
	/**
	 * Manual smoke test: prints the encoded form of a sample search URL whose query
	 * contains CJK text.
	 *
	 * @param args unused
	 * @throws IOException declared for compatibility; not thrown by the current body
	 */
	public static void main(String[] args) throws IOException {
		String sampleUrl = "http://news.search.yahoo.co.jp/search?ei=UTF-8&p=新幹線";
		String encodedUrl = encodURI(sampleUrl);
		System.out.println(encodedUrl);
	}

}