package com.zzsn.util;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.search.ChromeUtil;

import org.apache.http.ParseException;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;


public class RecorderUtil {

	/**
	 * Extracts news entries from Baidu news search result pages.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout). Every {@code div.result}
	 * element on the page becomes one {@link CatchWebByMetaSearch}: publish date and source site
	 * are read from {@code p.c-author}, title and link from {@code a[data-click]}.
	 *
	 * <p>A URL that is malformed, cannot be encoded or cannot be fetched is skipped; the results
	 * gathered from the other URLs are still returned.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfBaidu(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					// The multi-argument URI constructor quotes '%', so undo the double encoding
					// and turn encoded spaces into '+'.
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}

				Document doc;
				try {
					// connect() rejects empty or malformed URLs with an unchecked exception,
					// so it is kept inside the guarded section together with the fetch.
					Connection conn = Jsoup.connect(uri_code);
					conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
					doc = conn.timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("百度搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				System.out.println("----百度搜索----" + uri);

				Elements firstElementsLink = doc.select("div.result");
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int m = 0; m < firstElementsLink.size(); m++) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

					Elements orainAndDate = firstElementsLink.get(m).select("p.c-author");
					if (orainAndDate.size() > 0) {
						String orainAndDatestr = orainAndDate.text();
						// Publish date
						String publishDate = DateUtil.getPublishDate(orainAndDatestr);
						catchWebByMetaSearch.setPublishDate(publishDate);
						// Source site: the text before the first space
						String orin = orainAndDatestr.split(" ")[0].trim();
						catchWebByMetaSearch.setSourcesite(orin);
					}

					Elements titleAndUrl = firstElementsLink.get(m).select("a[data-click]");
					if (titleAndUrl.size() > 0) {
						// Title
						String title = titleAndUrl.get(0).text().trim();
						catchWebByMetaSearch.setTitle(title);
						// Source address
						String addressurl = titleAndUrl.attr("href");
						catchWebByMetaSearch.setSourceaddress(addressurl);
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (ParseException e) {
				e.printStackTrace();
			} catch (IOException e) {
				// Covers MalformedURLException from new URL(...): skip only this URL.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}

	/**
	 * Extracts entries from Baidu home (web) search result pages.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout). For every
	 * {@code div.result} element the first {@code a[data-click]} anchor supplies the title
	 * (with markup stripped) and a link that is resolved through
	 * {@link #getRealUrlFromBaiduUrl(String)}. A result whose title or resolved link is blank
	 * is dropped.
	 *
	 * <p>A URL that is malformed, cannot be encoded or cannot be fetched is skipped; the results
	 * gathered from the other URLs are still returned.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfBaiduHomeSearch(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					// The multi-argument URI constructor quotes '%', so undo the double encoding
					// and turn encoded spaces into '+'.
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}

				Document doc;
				try {
					// connect() rejects empty or malformed URLs with an unchecked exception,
					// so it is kept inside the guarded section together with the fetch.
					Connection conn = Jsoup.connect(uri_code);
					conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
					doc = conn.timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("百度首页搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				System.out.println("----百度首页搜索----" + uri);

				Elements firstElementsLink = doc.select("div.result");
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int m = 0; m < firstElementsLink.size(); m++) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

					Elements titleAndUrl = firstElementsLink.get(m).select("a[data-click]");
					if (titleAndUrl.size() > 0) {
						// Title: skip results with no text, or with nothing left once tags are removed
						String title = titleAndUrl.get(0).text().trim();
						if (title.length() == 0) {
							continue;
						}
						title = title.replaceAll("</?[^>]+>", "");
						if (title.trim().length() == 0) {
							continue;
						}
						catchWebByMetaSearch.setTitle(title);

						// Source address: the anchor points at a redirect link that must be resolved
						String addressurl = titleAndUrl.attr("href");
						if (addressurl == null || addressurl.trim().length() == 0) {
							continue;
						}
						try {
							addressurl = getRealUrlFromBaiduUrl(addressurl);
						} catch (IllegalArgumentException e) {
							// jsoup rejects links it cannot turn into an http(s) URL; drop this result only.
							continue;
						}
						if (addressurl == null || addressurl.trim().length() == 0) {
							continue;
						}
						catchWebByMetaSearch.setSourceaddress(addressurl);
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (ParseException e) {
				e.printStackTrace();
			} catch (IOException e) {
				// Covers MalformedURLException from new URL(...): skip only this URL.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}
	
    /**
     * Resolves a redirect link to its target by issuing a GET request without following
     * redirects and reading the {@code Location} response header (60 s timeout).
     *
     * @param url the redirect link to resolve
     * @return the value of the {@code Location} header, or an empty string when {@code url} is
     *         null or blank, is rejected by jsoup, the request fails, or the response carries
     *         no {@code Location} header
     */
    public static String getRealUrlFromBaiduUrl(String url) {
        if (url == null || url.trim().length() == 0) {
            return "";
        }
        int itimeout = 60000;
        try {
            Connection.Response res = Jsoup.connect(url).timeout(itimeout)
                    .method(Connection.Method.GET).followRedirects(false).execute();
            String location = res.header("Location");
            // Report "no redirect target" the same way as a failed request.
            return location == null ? "" : location;
        } catch (IOException e) {
            e.printStackTrace();
        } catch (IllegalArgumentException e) {
            // Thrown by jsoup for URLs it cannot parse (for example relative links).
            e.printStackTrace();
        }
        return "";
    }
	
	/**
	 * Extracts news entries from Sogou news search result pages.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout). Every {@code div.vrwrap}
	 * element on the page becomes one {@link CatchWebByMetaSearch}: publish date and source site
	 * are read from {@code p.news-from}, title and link from the first {@code a} element.
	 *
	 * <p>A URL that is malformed, cannot be encoded or cannot be fetched is skipped; the results
	 * gathered from the other URLs are still returned.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfSougou(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					// The multi-argument URI constructor quotes '%', so undo the double encoding
					// and turn encoded spaces into '+'.
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}

				Document doc;
				try {
					// connect() rejects empty or malformed URLs with an unchecked exception,
					// so it is kept inside the guarded section together with the fetch.
					Connection conn = Jsoup.connect(uri_code);
					conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50");
					doc = conn.timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("搜狗搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				System.out.println("----搜狗搜索----" + uri);

				Elements firstElementsLink = doc.select("div.vrwrap");
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int m = 0; m < firstElementsLink.size(); m++) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

					Elements orainAndDate = firstElementsLink.get(m).select("p.news-from");
					if (orainAndDate.size() > 0) {
						String orainAndDatestr = orainAndDate.text();
						// Publish date
						String publishDate = DateUtil.getPublishDate(orainAndDatestr);
						catchWebByMetaSearch.setPublishDate(publishDate);
						// Source site: the text before the first space
						String orin = orainAndDatestr.split(" ")[0].trim();
						catchWebByMetaSearch.setSourcesite(orin);
					}

					Elements titleAndUrl = firstElementsLink.get(m).select("a");
					if (titleAndUrl.size() > 0) {
						// Title
						String title = titleAndUrl.get(0).text().trim();
						catchWebByMetaSearch.setTitle(title);
						// Source address
						String addressurl = titleAndUrl.get(0).attr("href");
						catchWebByMetaSearch.setSourceaddress(addressurl);
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (ParseException e) {
				e.printStackTrace();
			} catch (IOException e) {
				// Covers MalformedURLException from new URL(...): skip only this URL.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}

	/**
	 * Extracts news entries from Haosou news search result pages.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout). Every
	 * {@code li[data-from="news"]} element becomes one {@link CatchWebByMetaSearch}: source site
	 * from {@code span.sitename}, publish date from the {@code title} attribute of
	 * {@code span.posttime}, title and link from the {@code a} elements.
	 *
	 * <p>A URL that is malformed, cannot be encoded or cannot be fetched is skipped; the results
	 * gathered from the other URLs are still returned.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfHaosou(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					// The multi-argument URI constructor quotes '%', so undo the double encoding
					// and turn encoded spaces into '+'.
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}

				Document doc;
				try {
					// connect() rejects empty or malformed URLs with an unchecked exception,
					// so it is kept inside the guarded section together with the fetch.
					Connection conn = Jsoup.connect(uri_code);
					conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
					doc = conn.timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("好搜搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				System.out.println("----好搜搜索----" + uri);

				Elements firstElementsLink = doc.select("li[data-from=\"news\"]");
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int m = 0; m < firstElementsLink.size(); m++) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

					Elements orain = firstElementsLink.get(m).select("span.sitename");
					if (orain.size() > 0) {
						// Source site
						String orin = orain.text();
						catchWebByMetaSearch.setSourcesite(orin);
					}

					Elements pubDate = firstElementsLink.get(m).select("span.posttime");
					if (pubDate.size() > 0) {
						// Publish date, taken from the element's title attribute
						String orinpublishDate = pubDate.attr("title");
						String publishDate = DateUtil.getPublishDate(orinpublishDate);
						catchWebByMetaSearch.setPublishDate(publishDate);
					}

					Elements titleAndUrl = firstElementsLink.get(m).select("a");
					if (titleAndUrl.size() > 0) {
						// Title
						// NOTE(review): text() on the whole selection joins the text of every anchor
						// in this item, unlike the sibling extractors which use get(0) — confirm intent.
						String title = titleAndUrl.text().trim();
						catchWebByMetaSearch.setTitle(title);
						// Source address
						String addressurl = titleAndUrl.attr("href");
						catchWebByMetaSearch.setSourceaddress(addressurl);
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (ParseException e) {
				e.printStackTrace();
			} catch (IOException e) {
				// Covers MalformedURLException from new URL(...): skip only this URL.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}

	/**
	 * Extracts news entries from Chinaso search result pages.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout). Every
	 * {@code li[class="reItem "]} element becomes one {@link CatchWebByMetaSearch}: publish date
	 * and source site are derived from {@code p[class="snapshot"]}; title and link are taken
	 * from the {@code a} elements when the item contains an {@code h2}.
	 *
	 * <p>A URL that is malformed, cannot be encoded or cannot be fetched is skipped; the results
	 * gathered from the other URLs are still returned.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfChinaso(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					// The multi-argument URI constructor quotes '%', so undo the double encoding
					// and turn encoded spaces into '+'.
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}
				System.out.println("----中国搜索----" + uri);

				Document doc;
				try {
					// connect() rejects empty or malformed URLs with an unchecked exception,
					// so it is kept inside the guarded section together with the fetch.
					Connection conn = Jsoup.connect(uri_code);
					conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
					doc = conn.timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("中国搜索中该关键词搜索没有相关新闻！");
					continue;
				}

				Elements firstElementsLink = doc.select("li[class=\"reItem \"]");
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int m = 0; m < firstElementsLink.size(); m++) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

					Elements orainAndDate = firstElementsLink.get(m).select("p[class=\"snapshot\"]");
					if (orainAndDate.size() > 0) {
						String orainAndDatestr = orainAndDate.text();
						// Publish date
						String publishDate = DateUtil.getPublishDate(orainAndDatestr);
						catchWebByMetaSearch.setPublishDate(publishDate);
						// Source site: what is left once digits and dashes are removed
						String orin = orainAndDatestr.replaceAll("\\d", "").replaceAll("-", "").trim();
						catchWebByMetaSearch.setSourcesite(orin);
					}

					Elements titleAndUrlh2 = firstElementsLink.get(m).select("h2");
					if (titleAndUrlh2.size() > 0) {
						Elements titleAndUrl = firstElementsLink.get(m).select("a");
						if (titleAndUrl.size() > 0) {
							// Title
							String title = titleAndUrl.get(0).text().trim();
							catchWebByMetaSearch.setTitle(title);
							// Source address
							String addressurl = titleAndUrl.attr("href");
							catchWebByMetaSearch.setSourceaddress(addressurl);
						}
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (ParseException e) {
				e.printStackTrace();
			} catch (IOException e) {
				// Covers MalformedURLException from new URL(...): skip only this URL.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}

	/**
	 * Extracts news entries from Yahoo news search result pages.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout). The {@code li} children of
	 * {@code ol[class=" reg searchCenterMiddle"]} each become one {@link CatchWebByMetaSearch}:
	 * source site from {@code span.cite}, title and link from the anchors inside
	 * {@code div.compTitle}. No publish date is set.
	 *
	 * <p>A failure while handling one URL skips only that URL; processing continues with the
	 * next one.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfYahoo(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					uri_code = Utility.encodURI(uri.toString()).replaceAll(
							"%2520", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}
				System.out.println("----雅虎搜索----" + uri);

				Document doc;
				try {
					Connection conn = Jsoup.connect(uri_code);
					conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");
					doc = conn.timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("雅虎搜索中该关键词搜索没有相关新闻！");
					continue;
				}

				Elements ElementsLink = doc.select("ol[class=\" reg searchCenterMiddle\"]");
				if (ElementsLink.size() == 0) {
					continue;
				}
				Elements firstElementsLink = ElementsLink.select("li");
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int m = 0; m < firstElementsLink.size(); m++) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();

					Elements orain = firstElementsLink.get(m).select("span.cite");
					if (orain.size() > 0) {
						// Source site
						String orin = orain.text();
						catchWebByMetaSearch.setSourcesite(orin);
					}

					Elements titleAndUrl = firstElementsLink.get(m).select("div.compTitle");
					if (titleAndUrl.size() > 0) {
						Elements titleAndUrl1 = titleAndUrl.select("a");
						if (titleAndUrl1.size() > 0) {
							// Title
							String title = titleAndUrl1.get(0).text().trim();
							catchWebByMetaSearch.setTitle(title);
							// Source address
							String addressurl = titleAndUrl1.attr("href");
							catchWebByMetaSearch.setSourceaddress(addressurl);
						}
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (Exception e) {
				// Best effort: one bad URL or page must not stop the remaining URLs.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}

	/**
	 * Extracts entries from Yahoo web search result pages using regular expressions.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout, content type ignored). The
	 * serialized document is scanned for {@code <div class="w"} blocks; within each block the
	 * {@code h3} section supplies link and title and the first {@code p} section supplies the
	 * summary. {@code <b>}/{@code <strong>} tags and double quotes are removed from title and
	 * summary.
	 *
	 * <p>A failure while handling one URL skips only that URL; processing continues with the
	 * next one.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfYahooWeb(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}

		// Compiled once: the expressions do not depend on the URL or on the matched block.
		Pattern pattern = Pattern
				.compile("<div class=\"w\"[\\s\\S]+?(?=./div>)+[\\s\\S]+?(?=./div>)+[\\s\\S]+?(?=./div>)");
		Pattern pattern_title = Pattern.compile("h3>[\\s\\S]+?(?=./h3>)");
		Pattern pattern_url = Pattern.compile("href=\"http[^\"]+");
		Pattern pattern_anchor_text = Pattern.compile("\">[\\s\\S]+?(?=./a>)");
		Pattern pattern_summary = Pattern.compile("p>[\\s\\S]+?(?=./p>)");

		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					uri_code = Utility.encodURI(uri.toString()).replaceAll(
							"%2520", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}
				System.out.println("----雅虎Web搜索----" + uri);

				Document doc;
				try {
					doc = Jsoup
							.connect(uri_code)
							.userAgent(
									"Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
							.ignoreContentType(true).timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("雅虎Web搜索中该关键词搜索没有相关新闻！");
					continue;
				}

				String info = doc.toString();
				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				Matcher mat = pattern.matcher(info);
				while (mat.find()) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
					String block = mat.group();

					Matcher match_url_title = pattern_title.matcher(block);
					if (match_url_title.find()) {
						String heading = match_url_title.group();
						// Link: drop the leading href=" (6 characters)
						Matcher match_url = pattern_url.matcher(heading);
						if (match_url.find() && match_url.group().length() > 6) {
							catchWebByMetaSearch.setSourceaddress(match_url
									.group().substring(6));
						}
						// Title: drop the leading "> (2 characters) and the emphasis markup
						Matcher match_title = pattern_anchor_text.matcher(heading);
						if (match_title.find() && match_title.group().length() > 2) {
							String title = match_title.group().substring(2)
									.replaceAll("<b>", "")
									.replaceAll("</b>", "")
									.replaceAll("<strong>", "")
									.replaceAll("</strong>", "")
									.replaceAll("\"", "");
							catchWebByMetaSearch.setTitle(title);
						}
					}

					// Summary: drop the leading p> (2 characters) and the emphasis markup
					Matcher match_summary = pattern_summary.matcher(block);
					if (match_summary.find() && match_summary.group().length() > 2) {
						String summary = match_summary.group().substring(2)
								.replaceAll("<b>", "")
								.replaceAll("</b>", "")
								.replaceAll("<strong>", "")
								.replaceAll("</strong>", "")
								.replaceAll("\"", "");
						catchWebByMetaSearch.setSummary(summary);
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (Exception e) {
				// Best effort: one bad URL or page must not stop the remaining URLs.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}
	


	/**
	 * Extracts entries from Toutiao search responses (JSON).
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and requested with {@code RequestUtil.httpRequest}. Every object
	 * in the response's {@code data} array becomes one {@link CatchWebByMetaSearch} built from
	 * the fields {@code article_url}, {@code title}, {@code abstract}, {@code datetime} and
	 * {@code source}.
	 *
	 * <p>The request URL is taken from {@code urlList}; a previously hard-coded debugging URL
	 * made every iteration fetch the same fixed keyword regardless of the input.
	 *
	 * <p>A failure while handling one URL or one array element skips only that item.
	 *
	 * @param urlList search request URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfToutiao(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					uri_code = Utility.encodURI(uri.toString()).replaceAll(
							"%2520", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}
				System.out.println("----今日头条搜索----" + uri);

				JSONObject jsonObj;
				try {
					jsonObj = RequestUtil.httpRequest(uri_code, "GET", null);
				} catch (Exception ex) {
					System.out.println("今日头条搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				if (jsonObj == null) {
					// No parsable response for this URL.
					continue;
				}

				JSONArray jsonArr = jsonObj.getJSONArray("data");
				if (null == jsonArr || jsonArr.size() == 0) {
					continue;
				}
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int j = 0; j < jsonArr.size(); j++) {
					try {
						JSONObject jsonObjT = jsonArr.getJSONObject(j);
						CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
						catchWebByMetaSearch.setSourceaddress(jsonObjT.getString("article_url"));
						catchWebByMetaSearch.setTitle(jsonObjT.getString("title"));
						catchWebByMetaSearch.setSummary(jsonObjT.getString("abstract"));
						catchWebByMetaSearch.setPublishDate(jsonObjT.getString("datetime"));
						catchWebByMetaSearch.setSourcesite(jsonObjT.getString("source"));
						catchWebByMetaSearch.setOrgId(orgId);
						catchWebByMetaSearch.setTid(tid);
						metaSearchList.add(catchWebByMetaSearch);
					} catch (Exception e) {
						// A malformed element is dropped; the rest of the array is still processed.
						continue;
					}
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (Exception e) {
				// Best effort: one bad URL or response must not stop the remaining URLs.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}

	/**
	 * Extracts entries from Sina search responses.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and requested with {@code RequestUtil.httpGetRequest}. The first
	 * five and last two characters of the response are removed before it is parsed as JSON
	 * (presumably a callback wrapper around the payload — confirm against the endpoint). Every
	 * object in {@code result.list} becomes one {@link CatchWebByMetaSearch} built from the
	 * fields {@code url}, {@code origin_title}, {@code intro}, {@code datetime} and
	 * {@code media}.
	 *
	 * <p>A failure while handling one URL or one array element skips only that item.
	 *
	 * @param urlList search request URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfSina(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					uri_code = Utility.encodURI(uri.toString()).replaceAll(
							"%2520", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}
				System.out.println("----新浪搜索----" + uri);

				JSONObject jsonObj;
				try {
					String body = RequestUtil.httpGetRequest(uri_code);
					// Strip the fixed-length prefix and suffix surrounding the JSON payload.
					body = body.substring(5);
					body = body.substring(0, body.length() - 2);
					System.out.println(body);
					jsonObj = JSONObject.parseObject(body);
				} catch (Exception ex) {
					System.out.println("新浪搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				if (jsonObj == null) {
					continue;
				}

				JSONObject result = jsonObj.getJSONObject("result");
				if (result == null) {
					// Response without a "result" object: nothing to extract for this URL.
					continue;
				}
				System.out.println("q=" + result.get("q"));
				System.out.println("page=" + result.get("page"));

				JSONArray jsonArr = result.getJSONArray("list");
				if (null == jsonArr || jsonArr.size() == 0) {
					continue;
				}
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int j = 0; j < jsonArr.size(); j++) {
					try {
						JSONObject jsonObjT = jsonArr.getJSONObject(j);
						CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
						catchWebByMetaSearch.setSourceaddress(jsonObjT.getString("url"));
						catchWebByMetaSearch.setTitle(jsonObjT.getString("origin_title"));
						catchWebByMetaSearch.setSummary(jsonObjT.getString("intro"));
						catchWebByMetaSearch.setPublishDate(jsonObjT.getString("datetime"));
						catchWebByMetaSearch.setSourcesite(jsonObjT.getString("media"));
						catchWebByMetaSearch.setOrgId(orgId);
						catchWebByMetaSearch.setTid(tid);
						metaSearchList.add(catchWebByMetaSearch);
					} catch (Exception e) {
						// A malformed element is dropped; the rest of the array is still processed.
						continue;
					}
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (Exception e) {
				// Best effort: one bad URL or response must not stop the remaining URLs.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}


	/**
	 * Extracts news entries from iFeng search result pages using regular expressions.
	 *
	 * <p>Each URL in {@code urlList} is rebuilt through {@link URI}, passed through
	 * {@code Utility.encodURI} and fetched with jsoup (10 s timeout, content type ignored). The
	 * serialized document is scanned for {@code <div class="searchResults"} blocks; link, title,
	 * summary and publish date are cut out of each block, and the source site is always set to
	 * the fixed label used below.
	 *
	 * <p>A URL that is malformed, cannot be encoded or cannot be fetched is skipped; the results
	 * gathered from the other URLs are still returned.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfIFeng(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}

		// Compiled once: the expressions do not depend on the URL or on the matched block.
		Pattern pattern = Pattern
				.compile("<div class=\"searchResults\"[\\s\\S]+?(?=./div>)");
		Pattern pattern_url = Pattern.compile("a href=\"http:[^\"]+");
		Pattern pattern_title = Pattern.compile("blank\">[\\s\\S]+?(?=./a>)");
		Pattern pattern_summary = Pattern.compile("p>原标题：[\\s\\S]+?(?=./p>)");
		Pattern pattern_publishdate = Pattern.compile("凤凰资讯[\\s\\S]+?(?=.</font>)");

		for (int i = 0; i < urlList.size(); i++) {
			try {
				URL url = new URL(urlList.get(i));
				URI uri;
				String uri_code;
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					// The multi-argument URI constructor quotes '%', so undo the double encoding
					// and turn encoded spaces into '+'.
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// Without a usable request URL there is nothing to fetch; skip this entry.
					e.printStackTrace();
					continue;
				}

				Document doc;
				try {
					doc = Jsoup
							.connect(uri_code)
							.userAgent(
									"Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
							.ignoreContentType(true).timeout(10000).get();
				} catch (Exception ex) {
					System.out.println("凤凰搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				System.out.println("----凤凰搜索----" + uri);
				String info = doc.toString();

				// Collected per page so that a failure half-way through adds nothing from this page.
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				Matcher mat = pattern.matcher(info);
				while (mat.find()) {
					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
					String block = mat.group();

					// Link: drop the leading a href=" (8 characters)
					Matcher match_url = pattern_url.matcher(block);
					if (match_url.find() && match_url.group().length() > 8) {
						catchWebByMetaSearch.setSourceaddress(match_url
								.group().substring(8));
					}

					// Title: drop the leading blank"> (7 characters), highlight markup and newlines
					Matcher match_title = pattern_title.matcher(block);
					if (match_title.find() && match_title.group().length() > 7) {
						String title = match_title.group().substring(7)
								.replaceAll("<font color=\"red\">", "")
								.replaceAll("</font>", "")
								.replaceAll("\\n", "");
						catchWebByMetaSearch.setTitle(title);
					}

					// Summary: drop the 6-character prefix, closing tags and newlines
					Matcher match_summary = pattern_summary.matcher(block);
					if (match_summary.find() && match_summary.group().length() > 6) {
						String summary = match_summary.group().substring(6)
								.replaceAll("</p>", "")
								.replaceAll("\\n", "");
						catchWebByMetaSearch.setSummary(summary);
					}

					// Source site: fixed label
					catchWebByMetaSearch.setSourcesite("凤凰资讯");

					// Publish date: only set when DateUtil produces a non-empty value
					Matcher match_publishdate = pattern_publishdate.matcher(block);
					if (match_publishdate.find()) {
						String dat = DateUtil.getPublishDate(match_publishdate
								.group());
						if (dat != null && dat.length() > 0) {
							catchWebByMetaSearch.setPublishDate(dat);
						}
					}

					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (ParseException e) {
				e.printStackTrace();
			} catch (IOException e) {
				// Covers MalformedURLException from new URL(...): skip only this URL.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}
	/**
	 * Extracts entries from Google search result pages rendered through {@code ChromeUtil}.
	 *
	 * <p>Before each URL the method sleeps 30 seconds, then obtains the page markup from
	 * {@code ChromeUtil.getChromeDoc} and parses it with jsoup. Every {@code div[class=g]}
	 * element becomes one {@link CatchWebByMetaSearch}: title from its first {@code h3}, link
	 * from its first {@code a}, and publish date from its own
	 * {@code span[class=f nsa fwzPFf]} when present. A result without an {@code h3} or
	 * {@code a} is skipped.
	 *
	 * <p>Processing stops at the first page that yields no results, and when the thread is
	 * interrupted (the interrupt flag is restored). Any other failure skips only the current
	 * URL.
	 *
	 * @param urlList search result page URLs; {@code null} is treated as an empty list
	 * @param charset not used by this method
	 * @param orgId   value stored on every record via {@code setOrgId}
	 * @param tid     value stored on every record via {@code setTid}
	 * @return the collected records, possibly empty, never {@code null}
	 */
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfGoogle(
			List<String> urlList, String charset, Long orgId, Long tid) {
		List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
		if (urlList == null) {
			return catchWebByMetaSearchList;
		}
		for (int i = 0; i < urlList.size(); i++) {
			try {
				// Fixed pause between requests.
				Thread.sleep(1000 * 30);

				String docstr = ChromeUtil.getChromeDoc(urlList.get(i));
				if (docstr == null) {
					continue;
				}
				Document doc = Jsoup.parse(docstr);
				Elements firstElementsLink = doc.select("div[class=g]");
				// A page without results ends the whole run.
				if (firstElementsLink.size() == 0) {
					break;
				}

				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				for (int j = 0; j < firstElementsLink.size(); j++) {
					Elements e = firstElementsLink.get(j).select("h3");
					Elements a = firstElementsLink.get(j).select("a");
					if (e.size() == 0 || a.size() == 0) {
						// Not a regular result entry; skipping it keeps the rest of the page.
						continue;
					}
					// Look up the date inside this result only, so each record gets its own date.
					Elements timespan = firstElementsLink.get(j).select("span[class=f nsa fwzPFf]");

					String title = e.get(0).text();
					String href = a.get(0).attr("href");
					System.out.println(title);
					System.out.println(href);

					CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
					catchWebByMetaSearch.setTid(tid);
					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setSourceaddress(href);
					catchWebByMetaSearch.setTitle(title);
					if (timespan.size() > 0) {
						catchWebByMetaSearch.setPublishDate(timespan.get(0).text());
					}
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			} catch (InterruptedException e) {
				// Preserve the interrupt for the caller and stop crawling.
				Thread.currentThread().interrupt();
				break;
			} catch (Exception e) {
				// Best effort: a failure on one page must not discard the other pages.
				e.printStackTrace();
			}
		}
		return catchWebByMetaSearchList;
	}
}
