package com.zzsn.crawler;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.zzsn.configuration.SpringContextUtil;
import com.zzsn.entity.ClbAnsProcessitem;
import com.zzsn.entity.DocInfo;
import com.zzsn.entity.SiteMsgTemple;
import com.zzsn.extractor.ContentFileFinder;
import com.zzsn.extractor.ExtEntity;
import com.zzsn.extractor.FileTag;
import com.zzsn.extractor.WeiXinDispatch;
import com.zzsn.job.JedisUtil;
import com.zzsn.util.Constants;
import com.zzsn.util.ContentUtility;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.kafka.core.KafkaTemplate;

import java.util.HashMap;
import java.util.Map;

@Slf4j
public class WeixinDetailThread extends Thread{

    public SiteMsgTemple siteMsgTemple =new SiteMsgTemple();

    public KafkaTemplate kafkaTemplate= SpringContextUtil.getBean(KafkaTemplate.class);
    @Override
    public void run() {
        detailCrawler();
    }

    public  boolean detailCrawler(){
         boolean flag=false;
        String weixinurl = siteMsgTemple.getSiteUri();
        //判断是否已爬取
        try {
            String urlflag = JedisUtil.getString(weixinurl+"_"+siteMsgTemple.getId());
            if (!StringUtils.isEmpty(urlflag)) {
                log.info("已爬取" + weixinurl);
                return flag;
            }
        }catch (Exception e){
            log.info("redis获取信息失败");
        }
        String weixinid=getParam(weixinurl);
        log.info("爬取的微信id= "+weixinid);

        WeiXinDispatch wx=new WeiXinDispatch();
        ExtEntity extEntity=wx.getExtractorElement(weixinurl);
        String contentNoTag = null;
        String formatImgContent= ContentFileFinder.getContentImgTag(extEntity.getContentWithTag(),"https://mp.weixin.qq.com/s/DePy9GFzh1tL844ik9YuWw");
//        System.out.println(extEntity.getContentWithTag());
//        String formatImgContent=extEntity.getContentWithTag();

        extEntity.setContentWithTag(formatImgContent);
        DocInfo docInfo=new DocInfo();
        docInfo.setSid(Long.parseLong(siteMsgTemple.getId()));
        docInfo.setSourceType("WeChat");
        docInfo.setSourceaddress(weixinurl);
        docInfo.setLang("zh_CN");
        docInfo.setContentType("HTML");
        docInfo.setSourceType("News");
        docInfo.setCharset("utf-8");
        docInfo.setTitle(extEntity.getTitle());
        docInfo.setAuthor(extEntity.getAuthor());
        docInfo.setPublishDate(extEntity.getPublishDate());
        docInfo.setOrigin("微信公众号-"+extEntity.getAuthor());
        StringBuffer sb = new StringBuffer();
        sb.append("<html><head>");
        sb.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />");
        sb.append("<title></title></head><body>");
        sb.append(extEntity.getContentWithTag());
        sb.append("</body></html>");
        contentNoTag=ContentUtility.TransferHTML2Text(sb.toString());
        docInfo.setContentWithTag(sb.toString());
        docInfo.setContentNoTag(contentNoTag);
        docInfo.setContentImgCvtTag(sb.toString());
        ObjectMapper mapper = new ObjectMapper();
        try {
            ClbAnsProcessitem processitem =docInfoTrans2Processitem(docInfo);
            processitem.setSource("微信爬虫");
            if(StringUtils.isEmpty(processitem.getTitle())|| StringUtils.isEmpty(processitem.getContent())){
                System.out.println("资讯的信息不全没有发送");
            }
            String docjson = mapper.writeValueAsString(processitem);
            kafkaTemplate.send(Constants.KAFKA_PRODUCT_TOPIC, "key", docjson);
            log.info("发送到kafka成功。");
            flag=true;
            //标记已爬取
//            JedisUtil.setString(weixinurl,"1",-1);
            JedisUtil.setString(weixinurl+"_"+siteMsgTemple.getId(),"1",-1);
        } catch (Exception e) {
//                    e.printStackTrace();
            log.info("发送到kafka失败。");
        }

        return flag;


    }
    public ClbAnsProcessitem docInfoTrans2Processitem(DocInfo docInfo){
        ClbAnsProcessitem clbAnsProcessitem=new ClbAnsProcessitem();
        clbAnsProcessitem.setSid(docInfo.getSid()+"");
        clbAnsProcessitem.setTitle(docInfo.getTitle());
        clbAnsProcessitem.setContent(docInfo.getContentNoTag());
        clbAnsProcessitem.setContentWithtag(docInfo.getContentWithTag());
        clbAnsProcessitem.setSummary(docInfo.getSummary());
        clbAnsProcessitem.setAuthor(docInfo.getAuthor());
        clbAnsProcessitem.setOrigin(docInfo.getOrigin());
        clbAnsProcessitem.setPublishDate(docInfo.getPublishDate());
        clbAnsProcessitem.setSourceAddress(docInfo.getSourceaddress());

        return clbAnsProcessitem;
    }

    public static Map<String,String> parse(String url) {
        Map<String,String> map=new HashMap<String,String>();
        if (url == null) {
            return map;
        }
        url = url.trim();
        if (url.equals("")) {
            return map;
        }
        String[] urlParts = url.split("\\?");
        String uri = urlParts[0];
        //没有参数
        if (urlParts.length == 1) {
            return map;
        }
        //有参数
        String[] params = urlParts[1].split("&");
        for (String param : params) {
            String[] keyValue = param.split("=");
            map.put(keyValue[0], keyValue[1]);
        }

        return map;
    }
    public static String getParam(String url) {

        Map<String, String> map=new HashMap<String, String>();
        try {
            map = parse(url);
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
            System.out.println(url);
        }
        return map.get("__biz");
    }

}
