package com.zzsn.utility.util;

import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

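/**
 * File reading/writing helpers and light HTML clean-up (entity decoding,
 * tag stripping, title extraction).
 *
 * @author ear
 * @version 2014.03.04
 */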

public class PreProcessor {
	/** File read by the instance readers when the caller passes a null or blank file name. */
	private static final String TEMP_FILE_NAME = "IEtemp.tmp";

	/** Matches a line made only of whitespace other than a line feed (or an empty line). */
	private static final Pattern BLANK_LINE = Pattern.compile("^[\\s&&[^\\n]]*$");

	/**
	 * Entity names (without the leading '&amp;') recognised by
	 * {@link #correctWebLine(StringBuffer, String)}, in the order they are tried.
	 */
	private static final String[] ENTITY_NAMES = { "gt;", "lt;", "nbsp;", "quot;", "ldquo;", "rdquo;" };

	/**
	 * Replacement text for each entry of {@link #ENTITY_NAMES}. The non-breaking
	 * space entity is dropped (empty replacement). The curly quotes are written
	 * as Unicode escapes so the result does not depend on the source-file encoding.
	 */
	private static final String[] ENTITY_TEXT = { ">", "<", "", "\"", "\u201C", "\u201D" };

	private static final Pattern SCRIPT_BLOCK = Pattern.compile(
			"(?s)<script\\s*.*?>(.*?)</script>", Pattern.CASE_INSENSITIVE);
	private static final Pattern STYLE_BLOCK = Pattern.compile(
			"(?s)<style\\s*.*?>(.*?)</style>", Pattern.CASE_INSENSITIVE);
	/** A self-closed script tag only; it must not run on to a later "/>" in the page. */
	private static final Pattern SELF_CLOSED_SCRIPT = Pattern.compile(
			"<script\\b[^>]*/>", Pattern.CASE_INSENSITIVE);
	/** A self-closed style tag only; it must not run on to a later "/>" in the page. */
	private static final Pattern SELF_CLOSED_STYLE = Pattern.compile(
			"<style\\b[^>]*/>", Pattern.CASE_INSENSITIVE);
	/** HTML comment; DOTALL so that comments spanning several lines are matched too. */
	private static final Pattern HTML_COMMENT = Pattern.compile("(?s)<!--.*?-->");
	private static final Pattern P_CLOSE = Pattern.compile("</p>", Pattern.CASE_INSENSITIVE);
	/** Opening p tag with optional attributes; does not match pre, param and similar tags. */
	private static final Pattern P_OPEN = Pattern.compile(
			"<p(?:[\\s/][^>]*)?>", Pattern.CASE_INSENSITIVE);
	/** Text between a bare title start tag and the next title end tag. */
	private static final Pattern TITLE = Pattern.compile(
			"(?<=<title>).*?(?=</title>)", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	public PreProcessor() {
	}

	/**
	 * Reads a file decoded as UTF-8.
	 *
	 * @param filename
	 *            path of the file to read
	 * @return the file content; every line is terminated with "\n"
	 * @throws Exception
	 *             if the file cannot be opened or read
	 */
	public static String getContent_UTF8(String filename) throws Exception {
		StringBuilder content = new StringBuilder();
		BufferedReader reader = openReader(filename, "UTF-8");
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append('\n');
			}
		} finally {
			// The original never closed the stream; closing the reader closes the file too.
			closeQuietly(reader);
		}
		return content.toString();
	}

	/**
	 * Reads a file using the charset name returned by
	 * {@code Utility.detectCharSet}. This is a best-effort read: an
	 * {@link IOException} is swallowed and whatever was read so far (possibly
	 * an empty string) is returned.
	 *
	 * @param srcFile
	 *            path of the file to read
	 * @return the file content; every line is terminated with "\r\n"
	 */
	public static String readFile(String srcFile) {
		StringBuilder content = new StringBuilder();
		BufferedReader reader = null;
		try {
			String charset = Utility.detectCharSet(srcFile);
			reader = openReader(srcFile, charset);
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line).append("\r\n");
			}
		} catch (IOException ignored) {
			// Deliberately silent (the original had its stack-trace print commented
			// out); the caller receives the partial or empty content.
		} finally {
			closeQuietly(reader);
		}
		return content.toString();
	}

	/**
	 * Writes {@code content} to {@code destFile} using the given encoding,
	 * replacing any existing file.
	 *
	 * @param destFile
	 *            path of the file to write
	 * @param encoding
	 *            charset name used to encode the content
	 * @param content
	 *            text to write
	 * @return {@code true} when the content was written and the file closed
	 *         successfully; {@code false} when an argument is null or any
	 *         exception occurred (the exception is printed to stderr)
	 */
	public static boolean writeFile(String destFile, String encoding,
			String content) {
		if ((content == null) || (destFile == null) || (encoding == null)) {
			System.err.println("Write file error: please input correct params");
			return false;
		}

		FileOutputStream fos = null;
		OutputStreamWriter writer = null;
		try {
			fos = new FileOutputStream(destFile);
			writer = new OutputStreamWriter(fos, encoding);
			writer.write(content);
			writer.flush();
			// Close inside the try so that a failing close is reported as a failed write.
			writer.close();
			return true;
		} catch (Exception e) {
			// Broad on purpose: every failure is reported as false, as before.
			e.printStackTrace();
			return false;
		} finally {
			// No-ops after a successful close; release the file on the failure paths.
			closeQuietly(writer);
			closeQuietly(fos);
		}
	}

	/**
	 * Reads a file using the charset name returned by
	 * {@code Utility.getFileEncoding}.
	 *
	 * @param fileName
	 *            path of the file to read; null or blank selects the default
	 *            temp file "IEtemp.tmp"
	 * @return the file content with every line terminated by "\r\n"; partial
	 *         or empty content when reading fails (the exception is printed
	 *         to stderr)
	 */
	public String convertFile2String(String fileName) {
		return loadFile(fileName, false);
	}

	/**
	 * Reads a file like {@link #convertFile2String(String)}, but passes every
	 * line through {@link #correctWebLine(StringBuffer, String)} and the
	 * assembled text through {@link #correctWebContent(String)}.
	 *
	 * @param fileName
	 *            path of the file to read; null or blank selects the default
	 *            temp file "IEtemp.tmp"
	 * @return the cleaned-up content; partial or empty when reading fails
	 */
	public String convertFile2StringWithCorrect(String fileName) {
		return correctWebContent(loadFile(fileName, true));
	}

	/**
	 * Appends one line of page text to {@code saveString}, replacing the
	 * entities for greater-than, less-than, straight and curly double quotes
	 * with their characters and dropping the non-breaking-space entity. Blank
	 * lines (and a null line) append nothing; every other line is terminated
	 * with "\r\n".
	 *
	 * @param saveString
	 *            buffer that receives the corrected line
	 * @param s
	 *            the line to correct
	 */
	public void correctWebLine(StringBuffer saveString, String s) {
		if (s == null || BLANK_LINE.matcher(s).matches()) {
			return;
		}

		int length = s.length();
		int prePos = 0; // start of the text not yet copied to the buffer
		for (int i = 0; i < length; i++) {
			if (s.charAt(i) != '&') {
				continue;
			}
			for (int k = 0; k < ENTITY_NAMES.length; k++) {
				String name = ENTITY_NAMES[k];
				// Offset form of startsWith avoids building a substring per '&'.
				if (s.startsWith(name, i + 1)) {
					saveString.append(s, prePos, i).append(ENTITY_TEXT[k]);
					i += name.length(); // now on the ';' closing the entity
					prePos = i + 1;
					break;
				}
			}
		}
		if (prePos < length) {
			saveString.append(s, prePos, length);
		}
		saveString.append("\r\n");
	}

	/**
	 * Strips, in this order: script blocks, style blocks, self-closed script
	 * and style tags, HTML comments (including multi-line ones), and opening
	 * and closing p tags. Only the p tags themselves are removed; the text
	 * between them is kept.
	 *
	 * @param content
	 *            page text to clean
	 * @return the cleaned text, or null when {@code content} is null
	 */
	public String correctWebContent(String content) {
		if (content == null) {
			return null;
		}
		content = SCRIPT_BLOCK.matcher(content).replaceAll("");
		content = STYLE_BLOCK.matcher(content).replaceAll("");
		content = SELF_CLOSED_SCRIPT.matcher(content).replaceAll("");
		content = SELF_CLOSED_STYLE.matcher(content).replaceAll("");
		content = HTML_COMMENT.matcher(content).replaceAll("");
		content = P_CLOSE.matcher(content).replaceAll("");
		content = P_OPEN.matcher(content).replaceAll("");
		return content;
	}

	/**
	 * Extracts the text of the first title element of a page file. The match
	 * is passed through {@code Utility.RemoveReturnCode},
	 * {@code Utility.RemoveHTMLCode} and {@code Utility.HTMLDecode}, in that
	 * order.
	 *
	 * @param filePath
	 *            path of the page file, read with
	 *            {@link #convertFile2String(String)}
	 * @return the processed title, or an empty string when the page has no
	 *         title element
	 */
	public String analyseTitle(String filePath) {
		String pageContent = this.convertFile2String(filePath);
		Matcher m = TITLE.matcher(pageContent);
		if (m.find()) {
			String title = m.group(0);
			return Utility.HTMLDecode(Utility.RemoveHTMLCode(Utility
					.RemoveReturnCode(title)));
		}
		return "";
	}

	/**
	 * Manual smoke run: cleans the file named by the first argument, or a
	 * fixed sample path when no argument is given. The result is discarded.
	 */
	public static void main(String[] args) {
		String path = (args != null && args.length > 0) ? args[0]
				: "E:\\TrainNhzt\\cs\\test_new\\test_no/41.htm";
		PreProcessor tool = new PreProcessor();
		tool.convertFile2StringWithCorrect(path);
	}

	/**
	 * Shared reader behind the two convertFile2String variants: resolves the
	 * file name, decodes with the charset reported by
	 * {@code Utility.getFileEncoding} and collects the lines.
	 */
	private String loadFile(String fileName, boolean correctLines) {
		// isBlank-style check; the original compared with ==, which tests identity.
		String path = (fileName == null || fileName.trim().length() == 0)
				? TEMP_FILE_NAME : fileName;

		StringBuffer text = new StringBuffer();
		BufferedReader reader = null;
		try {
			String charset = Utility.getFileEncoding(path);
			reader = openReader(path, charset);
			String line;
			while ((line = reader.readLine()) != null) {
				if (correctLines) {
					correctWebLine(text, line);
				} else {
					text.append(line).append("\r\n");
				}
			}
		} catch (IOException e) {
			// Covers FileNotFoundException and unsupported charset names as well.
			e.printStackTrace();
		} catch (NullPointerException e) {
			// Kept from the original so the read stays best-effort, e.g. when the
			// detected charset is null (InputStreamReader rejects a null name).
			e.printStackTrace();
		} finally {
			closeQuietly(reader);
		}
		return text.toString();
	}

	/** Opens a buffered, decoding reader; closes the file again if the decoder cannot be created. */
	private static BufferedReader openReader(String fileName, String charset)
			throws IOException {
		FileInputStream fis = new FileInputStream(fileName);
		try {
			return new BufferedReader(new InputStreamReader(fis, charset));
		} catch (IOException e) {
			closeQuietly(fis);
			throw e;
		} catch (RuntimeException e) {
			closeQuietly(fis);
			throw e;
		}
	}

	/** Closes a resource, ignoring null and any failure raised by close itself. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done when close fails.
		}
	}
}