package com.zzsn.service.impl;

import com.zzsn.dao.ProcessitemDAO;
import com.zzsn.entity.CatchWebByMetaSearch;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.Processitem;
import com.zzsn.entity.SiteTemplate;
import com.zzsn.service.ProcessitemService;
import com.zzsn.util.VerifySiteUtil;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import lombok.extern.slf4j.Slf4j;
import org.eclipse.jetty.util.UrlEncoded;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * {@link ProcessitemService} implementation that runs keyword searches against a
 * Baidu search URL and delegates the actual crawling to {@link VerifySiteUtil}.
 */
@Service
@Slf4j
public class ProcessitemServiceImpl  extends ServiceImpl<ProcessitemDAO, Processitem> implements ProcessitemService {

    /** Search URL template; {@link #KEYWORD_PLACEHOLDER} is replaced by the URL-encoded keyword. */
    private static final String SEARCH_URL_TEMPLATE =
            "https://www.baidu.com/s?ie=utf-8&medium=0&rtt=1&bsst=1&rsv_dl=news_t_sk&cl=2&wd=[keyword]&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8";
    // Alternative query template that was kept (commented out) in the original code:
    // "http://www.baidu.com/s?rtt=4&bsst=1&cl=2&tn=news&rsv_dl=ns_pc&lqst=1&x_bd_lqst=1&word="

    /** Token inside {@link #SEARCH_URL_TEMPLATE} that marks where the keyword goes. */
    private static final String KEYWORD_PLACEHOLDER = "[keyword]";

    // Hard-coded ids handed to VerifySiteUtil.CatchWebOfBaiduByProxy.
    // NOTE(review): their meaning is defined by VerifySiteUtil — confirm whether these are placeholders.
    private static final Long ORG_ID = 111L;
    private static final Long TID = 222L;

    /** Encoding reported when the page cannot be fetched or declares no charset. */
    private static final String DEFAULT_ENCODING = "gbk";

    /** Matches a complete {@code <meta ...>} tag. */
    private static final Pattern META_TAG = Pattern.compile("<meta[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches {@code charset=value}, {@code charset="value"} and {@code charset='value'};
     * group 1 is the bare charset name.
     */
    private static final Pattern CHARSET_VALUE =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s\"';|>]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Searches every keyword in a comma-separated list and returns the combined results.
     *
     * <p>Each keyword is trimmed; blank entries are skipped. Results of all keywords are
     * concatenated in keyword order (previously only the last keyword's results survived).
     *
     * @param keywords comma-separated keywords; may be {@code null}
     * @return the documents found for all keywords; never {@code null}, empty when
     *         {@code keywords} is {@code null} or contains no non-blank keyword
     */
    @Override
    public List<DocInfo> searchKeyword(String keywords) {
        List<DocInfo> docInfoList = new ArrayList<>();
        if (keywords == null) {
            return docInfoList;
        }

        VerifySiteUtil verifySiteUtil = new VerifySiteUtil();
        for (String rawKeyword : keywords.split(",")) {
            String keyword = rawKeyword.trim();
            if (keyword.isEmpty()) {
                // e.g. "a,,b" or a trailing blank — nothing to search for
                continue;
            }
            log.info("搜索关键词：" + keyword);

            String url = SEARCH_URL_TEMPLATE.replace(KEYWORD_PLACEHOLDER, UrlEncoded.encodeString(keyword));
            log.info("url：" + url);

            // The crawler API takes a list of URLs; we search one URL per keyword.
            List<String> urlList = new ArrayList<>();
            urlList.add(url);

            String charset = LocateCharSet(url);
            List<CatchWebByMetaSearch> searchHits =
                    verifySiteUtil.CatchWebOfBaiduByProxy(urlList, charset, ORG_ID, TID);
            List<DocInfo> keywordDocs = verifySiteUtil.CatchWebNews(searchHits, keyword);
            if (keywordDocs != null) {
                docInfoList.addAll(keywordDocs);
            }
        }
        return docInfoList;
    }

    /**
     * Determines the character encoding declared by the page at {@code url}.
     *
     * <p>The page is fetched with Jsoup and its {@code <meta>} tags are scanned in order;
     * the first {@code charset=...} value found is returned as written in the page.
     *
     * @param url address of the page to inspect
     * @return the declared charset name, or {@code "gbk"} when the page cannot be fetched
     *         or no {@code <meta>} tag declares a charset
     */
    private String LocateCharSet(String url) {
        try {
            Connection conn = Jsoup.connect(url);
            // Present ourselves as a regular desktop browser.
            conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36)");

            Document doc = conn.ignoreContentType(true).timeout(10000).get();

            Matcher metaTags = META_TAG.matcher(doc.toString());
            while (metaTags.find()) {
                Matcher charset = CHARSET_VALUE.matcher(metaTags.group());
                if (charset.find()) {
                    // Group 1 is guaranteed non-empty by the pattern, so no index arithmetic is needed.
                    return charset.group(1);
                }
            }
        } catch (IOException e) {
            // Best effort: a failed fetch must not abort the search, fall back to the default.
            log.warn("获取出错编码方式", e);
        }

        return DEFAULT_ENCODING;
    }

}
