package com.zzsn.search.souGouCrawler;


import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.search.util.GetCookies;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.util.ChromeUtil;
import com.zzsn.utility.util.DateUtil;
import com.zzsn.utility.util.Utility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.kafka.core.KafkaTemplate;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Slf4j
public class SouGouRecorderUtil {

	// 提取搜狗新闻列表URL
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> catchWebOfSougouList(
			List<String> urlList, String charset, Long orgId, Long tid, KafkaTemplate kafkaTemplate) {
		try {
			List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
			Map<String, String> cookie2 = GetCookies.getCookie2(urlList.get(0));
			String SUID=cookie2.get("SUID");
			String SNUID=cookie2.get("SNUID");
			if(StringUtils.isEmpty(SNUID)){
				SUID="";
				SNUID="";
			}
			for (int i = 0; i < urlList.size(); i++) {
				URL url = new URL(urlList.get(i));
				URI uri = null;
				String uri_code = "";
				try {
					uri = new URI(url.getProtocol(), url.getHost(),
							url.getPath(), url.getQuery(), null);
					uri_code = Utility.encodURI(uri.toString())
							.replaceAll("%2520", "+").replaceAll("%25", "%")
							.replaceAll("%20", "+");
				} catch (URISyntaxException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
				Connection conn = Jsoup.connect(uri_code);
				conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50");
				conn.cookie("browerV","3");
				conn.cookie("osV","1");
				conn.cookie("sct","3");
				conn.cookie("sst0","753");
				conn.cookie("SUV","005E5558458CAE4B5B248FD6FBCA1033");
				conn.cookie("SUID",SUID);
				conn.cookie("SNUID",SNUID);
				Document doc = null;
				try {
					doc = conn.timeout(10000).get();
					Thread.sleep(5000);
				} catch (Exception ex) {
					// ex.printStackTrace();
					System.out.println("搜狗搜索中该关键词搜索没有相关新闻！");
					continue;
				}
				System.out.println("----搜狗搜索----" + uri);
				Elements firstElementsLink = doc.select("div.vrwrap");
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
				for (int m=0;m<firstElementsLink.size();m++) {
					catchWebByMetaSearch = new CatchWebByMetaSearch();
					Elements orainAndDate = firstElementsLink.get(m).select("p.news-from");

					if (orainAndDate.size()>0) {
						String orainAndDatestr = orainAndDate.select("span:eq(1)").text();
						//发布时间
						String publishDate = DateUtil.getPublishDate(orainAndDatestr);
						catchWebByMetaSearch.setPublishDate(publishDate);

						//来源
//						String orin = orainAndDatestr.split(" ")[0].trim();
						String orin = orainAndDate.select("span:eq(0)").text();;
						catchWebByMetaSearch.setSourcesite(orin);
					}


					Elements titleAndUrl = firstElementsLink.get(m).select("a");
					if (titleAndUrl.size()>0) {
						//标题
						String title = titleAndUrl.get(0).text().trim();
						catchWebByMetaSearch.setTitle(title);
						//源网址
						Element element = titleAndUrl.get(0);
						element.setBaseUri(uri_code);
						String addressurl = titleAndUrl.get(0).absUrl("href");
						String s = sendGet(addressurl);
						Pattern pattern = Pattern.compile("URL=\\'(.{1,})\\'");
						Matcher matcher = pattern.matcher(s);
						String realUrl="";
						while(matcher.find()){
							realUrl=matcher.group(1);
						}
						if(StringUtils.isEmpty(realUrl)){
							realUrl=addressurl;
						}
						catchWebByMetaSearch.setSourceaddress(realUrl);

					}
					catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setTid(tid);
					metaSearchList.add(catchWebByMetaSearch);
				}
				for (CatchWebByMetaSearch catchMetaSearch:metaSearchList){
					ObjectMapper mapper = new ObjectMapper();
					try {
						String docjson = mapper.writeValueAsString(catchMetaSearch);
                        kafkaTemplate.send(Constants.KAFKA_PRODUCT_GOOGLE_URLLIST_TOPIC, "key", docjson);
						log.info("发送到kafka成功。");
					}catch (Exception e){
						log.info(e.getMessage());
					}
				}
				catchWebByMetaSearchList.addAll(metaSearchList);

			}
			return catchWebByMetaSearchList;

		}  catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}

	public static String sendGet(String url) {
		String result = "";
		String urlName = url;
		try {
			URL realURL = new URL(urlName);
			URLConnection conn = realURL.openConnection();
			conn.setRequestProperty("accept", "*/*");
			conn.setRequestProperty("connection", "Keep-Alive");
			conn.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36");
			conn.connect();
			Map<String, List<String>> map = conn.getHeaderFields();
			for (String s : map.keySet()) {
				System.out.println(s + "-->" + map.get(s));
			}
			BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream(), "utf-8"));
			String line;
			while ((line = in.readLine()) != null) {
				result += "\n" + line;
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return result;
	}

	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> catchWebOfGoogleList(
			List<String> urlList, String charset, Long orgId, Long tid) {
		try {
			List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
			for (int i = 0; i < urlList.size(); i++) {
				Thread.sleep(1000*2);
				CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				Document doc = null;
				System.out.println(urlList.get(i));
//				String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20)+"");
				String docstr=ChromeUtil.getChromeDoc(urlList.get(i));
				if(docstr==null){
					continue;
				}
				doc=Jsoup.parse(docstr);
				Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
				//若果没有结果则不循环
				if(firstElementsLink.size()==0){
					break;
				}
				String info = doc.toString();
				for (int j = 0; j < firstElementsLink.size(); j++) {
					catchWebByMetaSearch= new CatchWebByMetaSearch();
//					System.out.println(firstElementsLink.get(j).toString());
					//标题
//					Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
					Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
					//链接
					Elements a=firstElementsLink.get(j).select("a");
					//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
					System.out.println(e.get(0).text());
					System.out.println(a.get(0).attr("href"));
					catchWebByMetaSearch.setTid(tid);
					catchWebByMetaSearch.setSummary(urlList.get(i));
					//catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
					catchWebByMetaSearch.setTitle(e.get(0).text());
					//来源
					String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
					catchWebByMetaSearch.setSourcesite(origin);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			}
			return catchWebByMetaSearchList;

		} catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
	@SuppressWarnings("deprecation")
	public static List<CatchWebByMetaSearch> CatchWebOfGoogle1(
			List<String> urlList, String charset, Long orgId, Long tid) {
		try {
			List<CatchWebByMetaSearch> catchWebByMetaSearchList = new ArrayList<CatchWebByMetaSearch>();
			for (int i = 0; i < urlList.size(); i++) {
				Thread.sleep(1000*5);
				CatchWebByMetaSearch catchWebByMetaSearch = new CatchWebByMetaSearch();
				List<CatchWebByMetaSearch> metaSearchList = new ArrayList<CatchWebByMetaSearch>();
				Document doc = null;
				System.out.println(urlList.get(i));
				String docstr=ChromeUtil.getChromeDocnews(urlList.get(i),((i)*20+0)+"");
				if(docstr==null){
					continue;
				}
				doc=Jsoup.parse(docstr);
				Elements firstElementsLink = doc.select("g-card[class=ftSUBd]");
				//若果没有结果则不循环
				if(firstElementsLink.size()==0){
					break;
				}
				String info = doc.toString();
				for (int j = 0; j < firstElementsLink.size(); j++) {
					catchWebByMetaSearch= new CatchWebByMetaSearch();
//					System.out.println(firstElementsLink.get(j).toString());
					//标题
//					Elements e=firstElementsLink.get(j).select("div[class=mCBkyc tNxQIb y355M JIFdL JQe2Ld nDgy9d]");
					Elements e=firstElementsLink.get(j).select("div[class=\"mCBkyc y355M JQe2Ld nDgy9d\"]");
					//链接
					Elements a=firstElementsLink.get(j).select("a");
					//Elements timespan=firstElementsLink.get(j).select("span[class=WG9SHc]");
					System.out.println(e.get(0).text());
					System.out.println(a.get(0).attr("href"));
					catchWebByMetaSearch.setTid(tid);
					catchWebByMetaSearch.setSummary(urlList.get(i));
					//catchWebByMetaSearch.setOrgId(orgId);
					catchWebByMetaSearch.setSourceaddress(a.get(0).attr("href"));
					catchWebByMetaSearch.setTitle(e.get(0).text());
					//来源
					String origin=firstElementsLink.get(j).select("div[class=\"CEMjEf NUnG9d\"]").text();
					catchWebByMetaSearch.setSourcesite(origin);
					metaSearchList.add(catchWebByMetaSearch);
				}
				catchWebByMetaSearchList.addAll(metaSearchList);
			}
			return catchWebByMetaSearchList;

		} catch (Exception e) {
			e.printStackTrace();
		}
		return null;
	}
}
