package com.zzsn.search.soCrawler;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.cache.JedisUtil;
import com.zzsn.cache.MemcachedUtils;
import com.zzsn.docinfo.DocInfo;
import com.zzsn.entity.Site;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.paser.SourceTemplateByTag;
import com.zzsn.search.db.DBManager;
import com.zzsn.search.db.SnowIdUtils;
import com.zzsn.search.entity.ClbAnsProcessitem;
import com.zzsn.search.entity.KeywordMsg;
import com.zzsn.search.extractor.ContentFileFinder;
import com.zzsn.search.extractor.StandardWebExtractorHandler;
import com.zzsn.search.util.SplitKeyword;
import com.zzsn.search.util.SpringContextUtil;
import com.zzsn.utility.index.Constants;
import com.zzsn.utility.model.CatchWebByMetaSearch;
import com.zzsn.utility.model.ContentFileResult;
import com.zzsn.utility.model.FileTag;
import com.zzsn.utility.util.RequestUtil;
import com.zzsn.utility.util.SeleniumTime;
import com.zzsn.utility.util.Utility;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.kafka.core.KafkaTemplate;
import org.springframework.scheduling.annotation.Async;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.sql.SQLException;
import java.sql.Types;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Slf4j
@Data
public class WebSoSearchThread implements Runnable {

    // Keyword task to process; crawler() reads its keyword expression (getKeyWord) and id (getId).
    public KeywordMsg keywordMsg;
    // Not read anywhere in this class; its only use in crawler() is commented out.
    public List<String> keywords;
    // Not read in this class. NOTE(review): presumably an identifier assigned by whoever creates the thread - confirm.
    public Integer threadId;
    // Not read in this class. NOTE(review): presumably the start of a crawl time window - confirm.
    public String startTime;
    // Not read in this class. NOTE(review): presumably the end of a crawl time window - confirm.
    public String endTime;

    // Key of the JedisUtil set in which crawler() records keywords it has already processed.
    String cache_key="so_keyWords";

    /**
     * Thread entry point: performs a single crawl pass by delegating to {@link #crawler()}.
     */
    @Override
    public void run() {
        crawler();
    }

    // Kafka template obtained through SpringContextUtil.getBean when this object is constructed;
    // crawler() passes it on to SoRecorderUtil.catchWebOfSougouList.
    public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);

    /**
     * Splits the task's keyword expression into individual keywords and runs a meta search for
     * every keyword that is not yet a member of the {@code cache_key} set.
     *
     * <p>For each keyword:
     * <ul>
     *   <li>when {@code Constants.PROXY} is {@code "1"}, the hits returned by
     *       {@code SoRecorderUtil.CatchWebOfGoogle1} are handed to {@code CatchWebNews};</li>
     *   <li>otherwise result pages 1..99 of {@code Constants.META_SEARCH_URL} are built and given
     *       to {@code SoRecorderUtil.catchWebOfSougouList} together with the Kafka template.</li>
     * </ul>
     * Afterwards the keyword is added to the {@code cache_key} set. Cache failures are logged and
     * never abort the crawl.
     *
     * <p>NOTE(review): {@link #run()} calls this method directly on {@code this}, so the
     * {@code @Async} annotation presumably has no effect on that path (Spring applies it through
     * a proxy) - confirm.
     *
     * @throws NumberFormatException if the task id is not a valid long
     */
    @Async("asyncexecutorServiceWebBaidu")
    public void crawler() {
        // Expand the incoming keyword expression into the individual keywords to search for.
        List<String> keyWords = SplitKeyword.transForm(keywordMsg.getKeyWord());
        log.info("keyWords：" + keyWords);
        if (keyWords == null || keyWords.isEmpty()) {
            return;
        }

        // These values do not depend on the keyword, so compute them once.
        String charset = "utf-8";
        Long orgId = Long.parseLong(keywordMsg.getId());
        Long tid = orgId; // both ids are taken from the task id
        boolean proxyMode = "1".equals(Constants.PROXY); // constant-first: safe when PROXY is null
        String searchUrlTemplate = Constants.META_SEARCH_URL;

        for (String kWord : keyWords) {
            try {
                if (JedisUtil.sismember(cache_key, kWord)) {
                    // Keyword was handled by an earlier run.
                    continue;
                }
            } catch (Exception e) {
                // Best effort: when the cache is unavailable the keyword is crawled anyway.
                log.warn("缓存出问题", e);
            }

            List<String> urlList = new ArrayList<String>();
            if (proxyMode) {
                // NOTE(review): the URL list is empty on this path, exactly as before - confirm
                // that CatchWebOfGoogle1 does not need the search URLs.
                CatchWebNews(SoRecorderUtil.CatchWebOfGoogle1(urlList, charset, orgId, tid), kWord);
            } else {
                for (int page = 1; page < 100; page++) {
                    String pageUrl = searchUrlTemplate.replace("[keyword]", kWord);
                    urlList.add(pageUrl.replace("[page_num]", String.valueOf(page)));
                }
                // The returned hit list is not used here.
                SoRecorderUtil.catchWebOfSougouList(urlList, charset, orgId, tid, kafkaTemplate);
            }

            try {
                JedisUtil.sadd(cache_key, kWord);
            } catch (Exception e) {
                log.warn("缓存保存数据失败！", e);
            }
        }
    }

    /**
     * Downloads, parses and stores the articles behind a list of search hits.
     *
     * <p>Only the first five hits are processed. For each hit the method:
     * <ol>
     *   <li>skips it when its address is already in the cached address list kept for the hit's
     *       orgId/tid;</li>
     *   <li>skips PDF/download links and pages whose HTML cannot be fetched (one retry after five
     *       seconds, plus a dedicated fallback for toutiao.com);</li>
     *   <li>extracts the article with the site template cached for the host, falling back to
     *       {@link StandardWebExtractorHandler} when there is no template, the template yields
     *       nothing, or template parsing throws;</li>
     *   <li>stores the article through {@code intsertData} when title, plain-text content and
     *       publish date are all non-blank;</li>
     *   <li>appends the address to the cached address list.</li>
     * </ol>
     * A failure on one hit is logged and does not stop the remaining hits.
     *
     * @param catchWebList search hits to process
     * @param keyword      keyword that produced the hits; stored on every article
     */
    private void CatchWebNews(List<CatchWebByMetaSearch> catchWebList,String keyword) {
        try {
            int count = 0;
            // Only the first five hits are processed.
            int limit = Math.min(catchWebList.size(), 5);
            for (int i = 0; i < limit; i++) {
                try {
                    CatchWebByMetaSearch cwbm = catchWebList.get(i);

                    // Check whether this address is already in the cache pool for orgId/tid.
                    String orgId = String.valueOf(cwbm.getOrgId());
                    String key = Constants.SOURCEADDRESS + orgId + cwbm.getTid()+3;
                    Object cacheObj = MemcachedUtils.get(key);
                    List<String> addressList = new ArrayList<String>();
                    if (cacheObj != null && !"null".equals(cacheObj)) {
                        // This method stores the pool as a List<String> (see the end of the loop body).
                        @SuppressWarnings("unchecked")
                        List<String> cachedAddresses = (List<String>) cacheObj;
                        addressList = cachedAddresses;
                        // An address found in the pool was handled before; do not process it again.
                        if (addressList.contains(cwbm.getSourceaddress())) {
                            System.out.println(cwbm.getSourceaddress()+" 数据重复");
                            continue;
                        }
                    }

                    String infourl = cwbm.getSourceaddress();
                    String charset = "";
                    System.out.println(cwbm.getTitle()+"=="+infourl);
                    if (infourl == null || infourl.contains(".pdf") || infourl.trim().length()==0|| infourl.contains(".PDF")||infourl.contains("download")) {
                        continue;
                    }

                    // Fetch the page through the browser-based fetcher; retry once after 5 seconds.
                    String infodata = new SeleniumTime().getScopehtml(infourl);
                    if (StringUtils.isEmpty(infodata)) {
                        try {
                            Thread.sleep(1000*5);
                            infodata = new SeleniumTime().getScopehtml(infourl);
                        } catch (InterruptedException e) {
                            // Keep the interrupt visible to the code that owns this thread.
                            Thread.currentThread().interrupt();
                            log.warn("interrupted while waiting to retry " + infourl, e);
                        }
                    }
                    if (infourl.contains("toutiao.com") && (null == infodata || infodata.length() < 50)) {
                        infodata = RequestUtil.getTaotiaoData(infourl );
                    }
                    if (StringUtils.isEmpty(infodata)) {
                        System.out.println("122222222222222222222222/为空，则爬取下一个");
                        // Nothing fetched: move on to the next hit.
                        continue;
                    }

                    // The page text is used as-is, but only when an encoding can be detected in it.
                    // NOTE(review): when no encoding is detected, content stays null and is still
                    // passed to the extractors below, exactly as before - confirm that is intended.
                    String contentCharset = Utility.getWebEncodingByStr(infodata);
                    String content = contentCharset != null ? infodata : null;
                    if (content != null) {
                        cwbm.setCharset(charset);
                        cwbm.setLastModify("");
                        cwbm.setContent(content);
                    }

                    DocInfo docInfo = new DocInfo();
                    docInfo.setContentType("HTML");
                    docInfo.setOrgId(cwbm.getOrgId());
                    docInfo.setSid(cwbm.getSid());
                    docInfo.setSourceType("News");
                    docInfo.setLastModified(cwbm.getLastModify());
                    docInfo.setCharset("utf-8");
                    docInfo.setSourceaddress(cwbm.getSourceaddress());
                    docInfo.setTitle(cwbm.getTitle().replace("...", ""));
                    docInfo.setAuthor(cwbm.getAuthor());
                    docInfo.setPublishDate(cwbm.getPublishDate());
                    docInfo.setOrigin(cwbm.getSourcesite());
                    docInfo.setKeywords(keyword);
                    docInfo.setSummary(cwbm.getSummary());

                    StandardWebExtractorHandler swe = new StandardWebExtractorHandler();
                    try {
                        // Look for a parsing template registered for the page's host. qq.com links
                        // outside new.qq.com are rewritten first; the rewritten URL is only used
                        // for this lookup.
                        String templateUrl = infourl;
                        if (infourl.contains("qq.com") && !infourl.contains("://new.qq.com")) {
                            templateUrl = transqqURl(infourl);
                        }
                        String domainurl = new URL(templateUrl).getHost();
                        Object siteTempObj = MemcachedUtils.get("domainUri_"+domainurl);
                        if (siteTempObj != null && !"null".equals(siteTempObj)) {
                            Site site = (Site) siteTempObj;
                            SiteTemplate siteTemplate = new SiteTemplate();
                            siteTemplate.setMatchTitle(site.getMatchTitle());
                            siteTemplate.setMatchAuthor(site.getMatchAuthor());
                            siteTemplate.setMatchContent(site.getMatchContent());
                            siteTemplate.setMatchOrigin(site.getMatchOrigin());
                            siteTemplate.setMatchPublishDate(site.getMatchPublishDate());
                            siteTemplate.setMatchSummary(site.getMatchSummary());
                            System.out.println("1++++++++doPaserByTag");
                            docInfo = SourceTemplateByTag.doPaserByTag(content, docInfo, siteTemplate);
                        }
                        if (null != docInfo.getContentWithTag()) {
                            System.out.println("使用模板解析内容成功"+domainurl);
                            log.info("使用模板解析内容成功"+domainurl);
                        }
                        if (null == docInfo.getContentWithTag() || docInfo.getContentWithTag().trim().length() == 0) {
                            // No usable template result: record the site and use the generic extractor.
                            SourceTemplateByTag.saveNoTempSite(cwbm);
                            swe.doHandler(content, docInfo);
                        }
                    } catch (Exception e1) {
                        log.info("模板解析异常"+e1.getMessage());
                        SourceTemplateByTag.saveNoTempSite(cwbm);
                        swe.doHandler(content, docInfo);
                    }

                    System.out.println(docInfo.getTitle()+"---"+docInfo.getSourceaddress());
                    docInfo.setFileDownLoadPath(null);
                    Map<String, String> params = new HashMap<String, String>();
                    params.put("fromWhere", "clb百度元搜索");
                    if (null != cwbm.getTid()) {
                        params.put("tid", String.valueOf(cwbm.getTid()));
                    }
                    docInfo.setOtherParams(params);

                    boolean hasTitle = docInfo.getTitle() != null
                            && docInfo.getTitle().trim().length() > 0;
                    boolean hasText = docInfo.getContentNoTag() != null
                            && docInfo.getContentNoTag().trim().length() > 0;
                    if (hasTitle && hasText) {
                        if (docInfo.getPublishDate() != null && docInfo.getPublishDate().trim().length() > 0) {
                            try {
                                ContentFileResult contentFileResult =
                                        getContentFile(docInfo.getContentWithTag(), docInfo.getSourceaddress());
                                docInfo.setContentWithTag(ContentFileFinder.rmHtmlImgOrAtag(contentFileResult.getContentImgCvtTag()));
                                docInfo.setContentImgCvtTag(contentFileResult.getContentImgCvtTag());

                                for (FileTag fileTag : contentFileResult.getFileMap().values()) {
                                    // Each image is still fetched, but the stream is now always
                                    // closed so the HTTP connection is not leaked.
                                    try (InputStream is = getImg(fileTag.getAbsolutePath())) {
                                        if (is != null && is.available() > 0) {
                                            // Uploading the image to fileTag.getSavePath() is
                                            // currently disabled.
                                            log.debug("image fetched but not uploaded: " + fileTag.getAbsolutePath());
                                        }
                                    }
                                }
                            } catch (Exception e) {
                                // Image handling is best effort; the article is stored regardless.
                                log.info(e.getMessage());
                            }

                            System.out.println(docInfo.getTitle()+"---"+docInfo.getSourceaddress());
                            log.info("title:"+docInfo.getTitle()+"|address:"+docInfo.getSourceaddress()+
                                    "|content:"+(docInfo.getContentNoTag()==null?"":docInfo.getContentNoTag().length()+""));

                            intsertData(docInfo);
                            // Count only the articles that were actually handed to storage.
                            count++;
                        } else {
                            log.info("资讯发布时间："+docInfo.getPublishDate());
                        }
                    } else {
                        log.info("资讯内容："+docInfo.getContentNoTag());
                    }

                    // Add the address to the cache pool (third argument presumably an expiry in
                    // seconds, i.e. one day - confirm against MemcachedUtils).
                    System.out.println("加入缓存池");
                    addressList.add(docInfo.getSourceaddress());
                    MemcachedUtils.set(key,addressList,60*60*24);

                } catch (Exception e) {
                    log.info("访问出错"+e.getMessage());
                    continue;
                }

            }
            System.out.println("本次成功件数：" + count);
            log.info("本次成功件数：" + count);
        } catch (Exception e) {
            log.info("访问出错"+e.getMessage());
        }

    }
    // Parameterized INSERT for table cis_ans_processitem; intsertData supplies the eleven
    // placeholder values in the column order listed here.
    static String insertSql = "insert into cis_ans_processitem " +
            " (id,sid, title,summary,publish_date,origin,author, content,words,keywords,sourceaddress) " +
            " values(?,?,?,?,?,?,?,?,?,?,?)";
    /**
     * Inserts one parsed article into {@code cis_ans_processitem} using {@link #insertSql}.
     *
     * <p>A fresh id from {@code SnowIdUtils.uniqueLong()} is used as primary key. A summary
     * longer than 5000 characters is cut to its first 4900 characters. The {@code content} column
     * receives the tag-free text and the {@code words} column the tagged text. All values are
     * bound as {@code Types.CHAR}.
     *
     * <p>A {@link SQLException} is logged and not rethrown, so callers cannot tell whether the
     * row was written.
     *
     * @param docInfo article to store
     */
    private static void intsertData(DocInfo docInfo)
    {
        String id = SnowIdUtils.uniqueLong() + "";

        String summary = docInfo.getSummary();
        if (summary != null && summary.length() > 5000) {
            summary = summary.substring(0, 4900);
        }

        // Order must match the column list in insertSql.
        String[] coulmn = new String[]{
                id,
                docInfo.getSid() + "",
                docInfo.getTitle(),
                summary,
                docInfo.getPublishDate(),
                docInfo.getOrigin(),
                docInfo.getAuthor(),
                docInfo.getContentNoTag(),
                docInfo.getContentWithTag(),
                docInfo.getKeywords(),
                docInfo.getSourceaddress()};
        int[] type = new int[coulmn.length];
        Arrays.fill(type, Types.CHAR);

        DBManager dm = new DBManager();
        try {
            boolean flag = dm.updateOrAdd(coulmn, type, insertSql);
            if (flag) {
                System.out.println("插入成功");
            }
        } catch (SQLException e) {
            // Report through the logger so the failure and its cause end up in the log files.
            log.error("insert into cis_ans_processitem failed, sourceaddress=" + docInfo.getSourceaddress(), e);
        }
    }

    /** First run of digits in an article id; it becomes the date segment of the rewritten URL. */
    private static final Pattern QQ_ARTICLE_DIGITS = Pattern.compile("\\d+");

    /**
     * Rewrites a QQ news link into the form
     * {@code https://new.qq.com/omn/<digits>/<articleId>.html}.
     *
     * <p>The article id is the last path segment of {@code oldurl}; {@code <digits>} is the first
     * run of digits inside that id (empty when the id has none). A query string, a fragment and an
     * existing {@code .html}/{@code .htm} suffix are dropped before the id is taken, so they no
     * longer leak into the result (previously {@code id?x=1} produced {@code id?x=1.html} and
     * {@code id.html} produced {@code id.html.html}).
     *
     * @param oldurl original QQ news URL; must not be {@code null}
     * @return the rewritten URL
     */
    public static String transqqURl(String oldurl){
        // Cut the query string / fragment first; they may themselves contain '/'.
        int end = oldurl.length();
        int queryStart = oldurl.indexOf('?');
        if (queryStart >= 0) {
            end = queryStart;
        }
        int fragmentStart = oldurl.indexOf('#');
        if (fragmentStart >= 0 && fragmentStart < end) {
            end = fragmentStart;
        }
        String path = oldurl.substring(0, end);

        String articleId = path.substring(path.lastIndexOf('/') + 1);
        if (articleId.endsWith(".html")) {
            articleId = articleId.substring(0, articleId.length() - ".html".length());
        } else if (articleId.endsWith(".htm")) {
            articleId = articleId.substring(0, articleId.length() - ".htm".length());
        }

        Matcher digits = QQ_ARTICLE_DIGITS.matcher(articleId);
        String date = digits.find() ? digits.group() : "";
        return "https://new.qq.com/omn/" + date + "/" + articleId + ".html";
    }
    /**
     * Returns the first run of consecutive digits found in {@code content}.
     *
     * @param content text to scan; must not be {@code null}
     * @return the first digit run, or an empty string when the text contains no digit
     */
    public static String getNumbers(String content) {
        Matcher digitRun = Pattern.compile("\\d+").matcher(content);
        if (digitRun.find()) {
            return digitRun.group();
        }
        return "";
    }

    /**
     * Downloads the body of {@code infourl} with a plain HTTP GET and returns it as text.
     *
     * <p>The request is first sent with the client from {@link #createSSLClientDefault()}; when
     * that fails it is retried once with the client from {@link #createSSLClientDefaulttsl12()}.
     * The body is decoded with the charset declared by the response ("utf-8" when none is
     * declared), after passing it through {@code Utility.charsetcheck}.
     *
     * <p>The connection and both HTTP clients are released on every exit path.
     *
     * @param infourl address of the page to fetch
     * @return the page text, or an empty string when the request or the decoding fails or the
     *         response has no body
     */
    private  String getContentByUrl(String infourl){
        String infodata = "";
        HttpGet httpgeturl = new HttpGet(infourl);
        httpgeturl.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 60000);
        httpgeturl.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 60000);
        // Present the request as coming from a browser.
        httpgeturl.setHeader("Content-Type","application/x-www-form-urlencoded;charset=utf-8");
        httpgeturl.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);");
        httpgeturl.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");

        CloseableHttpClient httpClient = createSSLClientDefault();
        CloseableHttpClient tls12Client = null;
        try {
            HttpResponse httprespse;
            try {
                httprespse = httpClient.execute(httpgeturl);
            } catch (Exception e1) {
                // First attempt failed: retry once with the TLSv1.2 client.
                System.out.println("请求失败。。更换协议");
                tls12Client = createSSLClientDefaulttsl12();
                try {
                    httprespse = tls12Client.execute(httpgeturl);
                } catch (Exception e2) {
                    log.info(e2.getMessage());
                    return infodata;
                }
            }

            HttpEntity entitydata = httprespse.getEntity();
            if (entitydata == null) {
                // Response without a body: nothing to decode.
                return infodata;
            }
            String charset = EntityUtils.getContentCharSet(entitydata);
            if (charset == null) {
                charset = "utf-8";
            }
            charset = Utility.charsetcheck(charset);
            try {
                infodata = EntityUtils.toString(entitydata, charset);
            } catch (Exception e3) {
                log.info(e3.getMessage());
            }
            return infodata;
        } finally {
            httpgeturl.releaseConnection();
            closeHttpClientQuietly(httpClient);
            closeHttpClientQuietly(tls12Client);
        }
    }

    /** Closes {@code client} if it is not {@code null}; a failure to close is only logged. */
    private void closeHttpClientQuietly(CloseableHttpClient client) {
        if (client == null) {
            return;
        }
        try {
            client.close();
        } catch (IOException e) {
            log.info("failed to close http client: " + e.getMessage());
        }
    }

    /**
     * Builds an HTTP client whose SSL context accepts every certificate chain.
     *
     * @return the trust-all client, or {@code HttpClients.createDefault()} when the SSL context
     *         cannot be built
     */
    private  CloseableHttpClient createSSLClientDefault(){
        // Trust strategy that accepts every certificate chain.
        TrustStrategy trustEverything = new TrustStrategy() {
            @Override
            public boolean isTrusted(java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
                return true;
            }
        };
        try {
            SSLContext context = new SSLContextBuilder().loadTrustMaterial(null, trustEverything).build();
            SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(context);
            return HttpClients.custom().setSSLSocketFactory(socketFactory).build();
        } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
            e.printStackTrace();
        }
        // Building the SSL context failed: fall back to the library's default client.
        return HttpClients.createDefault();
    }

    /**
     * Builds an HTTP client whose SSL context uses protocol "TLSv1.2" and accepts every
     * certificate chain.
     *
     * @return the TLSv1.2 trust-all client, or {@code HttpClients.createDefault()} when the SSL
     *         context cannot be built
     */
    private  CloseableHttpClient createSSLClientDefaulttsl12(){
        // Trust strategy that accepts every certificate chain.
        TrustStrategy trustEverything = new TrustStrategy() {
            @Override
            public boolean isTrusted(java.security.cert.X509Certificate[] chain, String authType)
                    throws java.security.cert.CertificateException {
                return true;
            }
        };
        try {
            SSLContext context = new SSLContextBuilder()
                    .useProtocol("TLSv1.2")
                    .loadTrustMaterial(null, trustEverything)
                    .build();
            SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(context);
            return HttpClients.custom().setSSLSocketFactory(socketFactory).build();
        } catch (KeyManagementException | NoSuchAlgorithmException | KeyStoreException e) {
            e.printStackTrace();
        }
        // Building the SSL context failed: fall back to the library's default client.
        return HttpClients.createDefault();
    }
    /**
     * Rewrites the file/image tags of an article body and collects the files they refer to.
     *
     * <p>Every tag reported by {@code ContentFileFinder.getContentFileTag} is replaced in the
     * content by the tag's save tag. Each occurrence is replaced exactly once, so the method also
     * terminates when a save tag contains the text it replaces (the previous
     * replace-until-absent loop never ended in that case).
     *
     * <p>Created by 李东亮 on 2016-05-10.
     *
     * @version 1.0
     * @param contentWithTag article body including HTML tags
     * @param sourceaddress  address of the article, passed on to the tag finder
     * @return a result holding the unchanged content ({@code contentAbsoulute}), the rewritten
     *         content ({@code contentImgCvtTag}) and a map from each file's absolute path to its
     *         {@link FileTag}
     * @throws Exception declared for callers; propagates whatever the tag finder throws
     */
    private ContentFileResult getContentFile(String contentWithTag,String sourceaddress)throws Exception{
        String contentImgCvtTag = contentWithTag;
        Map<String, FileTag> imgDataMap = ContentFileFinder.getContentFileTag(contentWithTag,sourceaddress);
        // Key: path the file is fetched from; value: the tag describing where it is saved.
        Map<String, FileTag> imgMap = new HashMap<String, FileTag>();
        for (Map.Entry<String, FileTag> entry : imgDataMap.entrySet()) {
            String originalTag = entry.getKey();
            FileTag fileTag = entry.getValue();
            if (contentImgCvtTag.contains(originalTag)) {
                // String.replace already substitutes every occurrence in one pass.
                contentImgCvtTag = contentImgCvtTag.replace(originalTag, fileTag.getSaveTag());
            }
            imgMap.put(fileTag.getAbsolutePath(), fileTag);
        }

        ContentFileResult cis = new ContentFileResult();
        cis.setContentAbsoulute(contentWithTag);
        cis.setContentImgCvtTag(contentImgCvtTag);
        cis.setFileMap(imgMap);
        return cis;
    }

    /**
     * Downloads the resource at {@code dataUrl} and returns its bytes as a stream.
     *
     * <p>The response body is read completely into memory, after which the response and the HTTP
     * client are closed. The returned stream is therefore independent of the network connection
     * and holds no resource that the caller could leak.
     *
     * @param dataUrl address of the image/file to download
     * @return an in-memory stream over the downloaded bytes, or {@code null} when the response
     *         has no body or the download fails
     */
    public InputStream getImg(String dataUrl){
        try (CloseableHttpClient httpClient = createSSLClientDefault()) {
            HttpGet get = new HttpGet();
            get.setURI(new URI(dataUrl));
            try (CloseableHttpResponse response = httpClient.execute(get)) {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    return null;
                }
                // Buffer the body so the connection can be released before returning.
                byte[] body = EntityUtils.toByteArray(entity);
                return new java.io.ByteArrayInputStream(body);
            }
        } catch (Exception e) {
            log.warn("failed to download " + dataUrl, e);
            return null;
        }
    }

}
