package com.zzsn.knowbase.util;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for cleaning HTML fragments and extracting plain text or
 * host names from them.
 *
 * <p>All methods are static and keep no mutable state; every regular expression is
 * compiled once when the class is loaded. The HTML handling is heuristic (regular
 * expressions, not a parser), so nested or malformed markup is handled on a
 * best-effort basis.
 */
public class ContentUtility {

    // ------------------------------------------------------------------
    // Shared patterns. Names and visibility are kept as they were so that
    // other classes referring to them keep compiling.
    // ------------------------------------------------------------------

    /** A div whose opening tag contains "display:none", up to the first closing div. */
    static final Pattern divNoneP = Pattern.compile("(?s)<div[^>]*display:none[^>]*>.*?</div>", Pattern.CASE_INSENSITIVE);

    static final Pattern divP = Pattern.compile("<div>", Pattern.CASE_INSENSITIVE);

    static final Pattern divRP = Pattern.compile("</div>", Pattern.CASE_INSENSITIVE);

    static final Pattern brP = Pattern.compile("<br />", Pattern.CASE_INSENSITIVE);

    static final Pattern br2P = Pattern.compile("<br>", Pattern.CASE_INSENSITIVE);

    static final Pattern spaceP = Pattern.compile("&nbsp;", Pattern.CASE_INSENSITIVE);

    static final Pattern strongP = Pattern.compile("<strong>", Pattern.CASE_INSENSITIVE);

    static final Pattern strongRP = Pattern.compile("</strong>", Pattern.CASE_INSENSITIVE);

    static final Pattern pP = Pattern.compile("<p>", Pattern.CASE_INSENSITIVE);

    static final Pattern pRP = Pattern.compile("</p>", Pattern.CASE_INSENSITIVE);

    static final Pattern centerP = Pattern.compile("<center[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern centerRP = Pattern.compile("</center>", Pattern.CASE_INSENSITIVE);

    /** Any opening tag; group 1 is the tag name. */
    static final Pattern removeAttrP = Pattern.compile("<([a-zA-Z0-9]+)[^>]*>", Pattern.CASE_INSENSITIVE);

    /** Conditional-comment style blocks such as {@code <!--[if IE]> ... <![endif]-->}. */
    static final Pattern commentP = Pattern.compile("(?s)<!--[^>]*>.*?<![^>]*-->", Pattern.CASE_INSENSITIVE);

    static final Pattern inputP = Pattern.compile("<input[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern formP = Pattern.compile("<form[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern formRP = Pattern.compile("</form>", Pattern.CASE_INSENSITIVE);

    static final Pattern buttonP = Pattern.compile("(?s)<button[^>]*>.*?</button>", Pattern.CASE_INSENSITIVE);

    static final Pattern iframeP = Pattern.compile("(?s)<iframe[^>]*>.*?</iframe>", Pattern.CASE_INSENSITIVE);

    static final Pattern noscriptP = Pattern.compile("(?s)<noscript>.*?</noscript>", Pattern.CASE_INSENSITIVE);

    static final Pattern objectP = Pattern.compile("(?s)<object[^>]*>.*?</object>", Pattern.CASE_INSENSITIVE);

    static final Pattern linkP = Pattern.compile("(?s)<link[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern imgReplaceP = Pattern.compile("<img([^>]*)>", Pattern.CASE_INSENSITIVE);

    static final Pattern imgRevReplaceP = Pattern.compile("<_img([^>]*)>", Pattern.CASE_INSENSITIVE);

    static final Pattern imgP = Pattern.compile("<img[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern imgRP = Pattern.compile("</img>", Pattern.CASE_INSENSITIVE);

    public static final Pattern aRemoveP = Pattern.compile("(?s)<a[^>]*>.*?</a>", Pattern.CASE_INSENSITIVE);

    static final Pattern legendRemoveP = Pattern.compile("(?s)<legend[^>]*>.*?</legend>", Pattern.CASE_INSENSITIVE);

    static final Pattern aP = Pattern.compile("<a[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern aRP = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE);

    static final Pattern fontP = Pattern.compile("<font[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern fontRP = Pattern.compile("</font>", Pattern.CASE_INSENSITIVE);

    static final Pattern hP = Pattern.compile("<h\\d[^>]*>", Pattern.CASE_INSENSITIVE);

    static final Pattern hRP = Pattern.compile("</h\\d>", Pattern.CASE_INSENSITIVE);

    static final Pattern ulRP = Pattern.compile("</ul>", Pattern.CASE_INSENSITIVE);

    static final Pattern liRP = Pattern.compile("</li>", Pattern.CASE_INSENSITIVE);

    static final Pattern trRP = Pattern.compile("</tr>", Pattern.CASE_INSENSITIVE);

    static final Pattern tdRP = Pattern.compile("</td>", Pattern.CASE_INSENSITIVE);

    static final Pattern textareaRemoveP = Pattern.compile("(?s)<textarea[^>]*>.*?</textarea>", Pattern.CASE_INSENSITIVE);

    static final Pattern selectRemoveP = Pattern.compile("(?s)<select[^>]*>.*?</select>", Pattern.CASE_INSENSITIVE);

    static final Pattern optionRemoveP = Pattern.compile("(?s)<option[^>]*>.*?</option>", Pattern.CASE_INSENSITIVE);

    static final Pattern labelRemoveP = Pattern.compile("(?s)<label[^>]*>.*?</label>", Pattern.CASE_INSENSITIVE);

    static final String regHTMLNumcode = "&#(\\d{4,5});";

    static final Pattern patHTMLNumCode = Pattern.compile(regHTMLNumcode);

    // ------------------------------------------------------------------
    // Private patterns and tables used by the methods below.
    // ------------------------------------------------------------------

    /** Text inserted where a block-level tag or line break used to be. */
    private static final String PARAGRAPH_BREAK = "\n\n";

    /** Two tags separated only by whitespace. */
    private static final Pattern interTagSpaceP = Pattern.compile("(<[^>]*>)\\s*(<[^>]*>)");

    /** Whatever tag is left once the specific ones have been handled. */
    private static final Pattern anyTagP = Pattern.compile("<[^>]*>");

    /** Every spelling of a line break: {@code <br>}, {@code <br/>}, {@code <br />}, {@code <br clear="all">}. */
    private static final Pattern lineBreakTagP = Pattern.compile("<br\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    private static final Pattern scriptBlockP = Pattern.compile("(?s)<script\\s*.*?>(.*?)</script>", Pattern.CASE_INSENSITIVE);

    private static final Pattern styleBlockP = Pattern.compile("(?s)<style\\s*.*?>(.*?)</style>", Pattern.CASE_INSENSITIVE);

    // The three self-closing patterns use [^>]* so that a match can never run past the
    // end of its own tag and swallow the content that follows it.
    private static final Pattern scriptSelfClosingP = Pattern.compile("<script\\b[^>]*/>", Pattern.CASE_INSENSITIVE);

    private static final Pattern styleSelfClosingP = Pattern.compile("<style\\b[^>]*/>", Pattern.CASE_INSENSITIVE);

    private static final Pattern imgSelfClosingP = Pattern.compile("<img\\b[^>]*/>", Pattern.CASE_INSENSITIVE);

    private static final Pattern htmlCommentP = Pattern.compile("(?s)<!--.*?-->");

    private static final Pattern emptyUlP = Pattern.compile("(?s)<ul[^>]*>\\s*</ul>");

    private static final Pattern emptyDivP = Pattern.compile("(?s)<div[^>]*>\\s*</div>");

    private static final Pattern emptyPP = Pattern.compile("(?s)<p[^>]*>\\s*</p>");

    private static final Pattern emptyLiP = Pattern.compile("(?s)<li[^>]*>\\s*</li>");

    private static final Pattern emptyCanvasP = Pattern.compile("(?s)<canvas[^>]*>\\s*</canvas>");

    /** Form controls and form wrappers, in the order they are stripped. */
    private static final Pattern[] formControlPs = {
        textareaRemoveP, selectRemoveP, optionRemoveP, labelRemoveP, inputP, formP, buttonP, formRP
    };

    /** Comments, embedded content and presentational wrappers, in the order they are stripped. */
    private static final Pattern[] embeddedContentPs = {
        commentP, legendRemoveP, iframeP, noscriptP, objectP, centerP, centerRP
    };

    /** Elements that contain nothing but whitespace, in the order they are stripped. */
    private static final Pattern[] emptyElementPs = {
        emptyUlP, emptyDivP, emptyPP, emptyLiP, emptyCanvasP
    };

    /**
     * One character reference: group 1 holds the digits of a decimal reference
     * (terminating semicolon optional), group 2 the name of a named reference.
     */
    private static final Pattern entityP = Pattern.compile("&(?:#(\\d{1,7})(?!\\d);?|([a-zA-Z]+);)");

    /** Processing instructions such as {@code <?xml ... ?>}. */
    private static final Pattern processingInstructionP = Pattern.compile("<\\?[^>]*>");

    /** Named references understood by {@link #HTMLDecode(String)}. */
    private static final Map<String, String> namedEntities = buildNamedEntities();

    private static final Pattern trailingSpacesP = Pattern.compile(" +\r\n");

    private static final Pattern spaceRunP = Pattern.compile(" +");

    /** No-break space and ideographic space. */
    private static final Pattern wideSpaceP = Pattern.compile("[\\u00A0\\u3000]");

    /** Host followed by a path, port or query delimiter. */
    private static final Pattern domainP =
            Pattern.compile("(?<=//|)((\\w)+\\.)+[\\s\\S]+?(?=\\/|\\:|\\?)", Pattern.CASE_INSENSITIVE);

    /** Host that runs to the end of the address (or to a fragment), with nothing after it. */
    private static final Pattern domainToEndP =
            Pattern.compile("((\\w)+\\.)+[^\\s/:?#]+(?=#|$)", Pattern.CASE_INSENSITIVE);

    /** One label followed by a known suffix, anchored at the end of the host name. */
    private static final Pattern domainSuffixP = Pattern.compile(
            "[a-zA-Z0-9-]+\\.(cn|com|cdt|com\\.mo|nl|us|biz|de|org\\.sa|info|ee|org\\.zw|co\\.uk|ie|com\\.sg"
            + "|co\\.ke|be|eu|com\\.cn|gov\\.cn|co\\.kr|sh\\.cn|cssn\\.cn|org|ac\\.cn|co|org\\.cn|net|org\\.uk"
            + "|hk|fr|no|se|org\\.sg|bg|org\\.pl|cz|at|org\\.nz|or\\.jp|mu|org\\.pe|com\\.hk|net\\.cn|mil|edu"
            + "|edu\\.cn|cas\\.cn|tw|tv|me|cc)$",
            Pattern.CASE_INSENSITIVE);

    /**
     * Strips markup that carries no article content: scripts, styles, hidden divs,
     * form controls, comments, iframes, legends, {@code <center>} and cufon wrappers,
     * empty containers, and the attributes of most tags (see
     * {@link #removeUselessAtt(String)}). Anchors, images and table cells keep their
     * attributes.
     *
     * @param htmlText HTML fragment, may be {@code null}
     * @return the cleaned fragment, or {@code null} when {@code htmlText} is {@code null}
     */
    public static String RemoveUselessHTMLTagX(String htmlText) {
        if (htmlText == null) {
            return null;
        }
        String html = RemoveStyleCode(htmlText);
        html = html.replace("&nbsp;", " ");
        html = divNoneP.matcher(html).replaceAll("");
        html = stripAll(html, formControlPs);

        // Attributes go before the next group on purpose: <noscript class="x"> only
        // matches noscriptP once it has been reduced to <noscript>.
        html = removeUselessAtt(html);
        html = stripAll(html, embeddedContentPs);

        html = html.replace("<cufontext>", "").replace("</cufontext>", "")
                .replace("<cufon>", "").replace("</cufon>", "");

        return stripAll(html, emptyElementPs);
    }

    /**
     * Turns an HTML fragment into text by replacing block-level tags and line breaks
     * with a blank line and dropping every other tag. Entities other than
     * {@code &nbsp;} and {@code &#8226;} are left for {@link #HTMLDecode(String)}.
     *
     * @param src HTML fragment, may be {@code null}
     * @return the trimmed text, or {@code null} when {@code src} is {@code null}
     */
    public static String RemoveHTMLCode(String src) {
        if (src == null) {
            return null;
        }
        String text = interTagSpaceP.matcher(src).replaceAll("$1$2");
        text = breakAt(text, divP);
        text = breakAt(text, divRP);
        // A single pattern covers <br>, <br/>, <br /> and <br ...>; without it "<br/>"
        // fell through to the generic strip below and glued two lines together.
        text = breakAt(text, lineBreakTagP);
        text = spaceP.matcher(text).replaceAll(" ");
        text = text.replace("&#8226;", "\u2022");
        text = drop(text, strongP);
        text = drop(text, strongRP);
        text = breakAt(text, pP);
        text = breakAt(text, pRP);
        text = drop(text, aP);
        text = drop(text, aRP);
        text = drop(text, imgP);
        text = drop(text, fontP);
        text = drop(text, fontRP);
        text = breakAt(text, hRP);
        text = breakAt(text, ulRP);
        text = breakAt(text, liRP);
        text = breakAt(text, trRP);
        text = breakAt(text, tdRP);

        text = drop(text, anyTagP);

        return text.trim();
    }

    /**
     * Removes the attributes of every opening tag except {@code td}, {@code th},
     * {@code img} and {@code a} (matched case-insensitively), which are left untouched.
     * The input is rewritten in a single pass.
     *
     * <p>Originally written by Li Dongliang, 2016-07-14.
     *
     * @version 1.0
     * @param htmlText HTML fragment, may be {@code null}
     * @return the fragment with attributes removed, or {@code null} when
     *         {@code htmlText} is {@code null}
     */
    public static String removeUselessAtt(String htmlText) {
        if (htmlText == null) {
            return null;
        }
        Matcher tag = removeAttrP.matcher(htmlText);
        StringBuilder out = new StringBuilder(htmlText.length());
        int copiedUpTo = 0;
        while (tag.find()) {
            if (keepsAttributes(htmlText, tag)) {
                // Left in place; it is copied verbatim together with the text after it.
                continue;
            }
            out.append(htmlText, copiedUpTo, tag.start());
            out.append('<').append(tag.group(1)).append('>');
            copiedUpTo = tag.end();
        }
        out.append(htmlText, copiedUpTo, htmlText.length());
        return out.toString();
    }

    /**
     * Decodes HTML character references in a single pass, so that text produced by one
     * reference is never decoded a second time ({@code &amp;lt;} becomes {@code &lt;}).
     *
     * <p>Supported are the named references {@code quot}, {@code nbsp} (decoded to a
     * plain space), {@code middot}, {@code amp}, {@code ldquo}, {@code rdquo},
     * {@code gt}, {@code lt}, {@code raquo}, {@code times}, {@code ccedil},
     * {@code atilde}, {@code ecirc}, and decimal references such as {@code &#8220;}
     * (the terminating semicolon is optional). Unknown names and numbers that are not
     * valid Unicode code points are left unchanged. Processing instructions
     * ({@code <?...>}) present after decoding are removed.
     *
     * @param str text to decode, may be {@code null}
     * @return the decoded text, or {@code null} when {@code str} is {@code null}
     */
    public static String HTMLDecode(String str) {
        if (str == null) {
            return null;
        }
        Matcher reference = entityP.matcher(str);
        StringBuilder out = new StringBuilder(str.length());
        int copiedUpTo = 0;
        while (reference.find()) {
            String decoded = decodeReference(reference.group(1), reference.group(2));
            if (decoded == null) {
                continue;
            }
            out.append(str, copiedUpTo, reference.start()).append(decoded);
            copiedUpTo = reference.end();
        }
        out.append(str, copiedUpTo, str.length());

        return processingInstructionP.matcher(out).replaceAll("");
    }

    /**
     * Deletes every carriage return and line feed.
     *
     * @param src text, may be {@code null}
     * @return the text on a single line, or {@code null} when {@code src} is {@code null}
     */
    public static String RemoveHTMLReturnCode(String src) {
        if (src == null) {
            return null;
        }
        return src.replace("\r", "").replace("\n", "");
    }

    /**
     * Extracts the plain text of an HTML string. Line breaks in the source are removed
     * first; a space followed by two ideographic spaces (a paragraph indent) becomes
     * CRLF, runs of spaces collapse to one, and no-break and ideographic spaces are
     * dropped.
     *
     * @param htmlText HTML string, may be {@code null}
     * @return the extracted text, or {@code null} when {@code htmlText} is {@code null}
     */
    public static String TransferHTML2Text(String htmlText) {
        if (htmlText == null) {
            return null;
        }
        String text = HTMLDecode(RemoveHTMLCode(RemoveStyleCode(RemoveHTMLReturnCode(htmlText))));
        text = text.replace(" \u3000\u3000", "\r\n");

        text = trailingSpacesP.matcher(text).replaceAll("\r\n");
        text = spaceRunP.matcher(text).replaceAll(" ");
        text = wideSpaceP.matcher(text).replaceAll("");

        return text;
    }

    /**
     * Removes script and style elements, {@code <noscript>}, {@code <object>},
     * {@code <link>}, self-closing {@code <img .../>} tags and HTML comments.
     *
     * @param content HTML fragment, may be {@code null}
     * @return the fragment without those elements, or {@code null} when
     *         {@code content} is {@code null}
     */
    public static String RemoveStyleCode(String content) {
        if (content == null) {
            return null;
        }
        String html = scriptBlockP.matcher(content).replaceAll("");
        html = styleBlockP.matcher(html).replaceAll("");
        html = scriptSelfClosingP.matcher(html).replaceAll("");
        html = styleSelfClosingP.matcher(html).replaceAll("");

        html = noscriptP.matcher(html).replaceAll("");
        html = objectP.matcher(html).replaceAll("");
        html = linkP.matcher(html).replaceAll("");

        html = imgSelfClosingP.matcher(html).replaceAll("");

        // Comments go last, after the elements above have been taken out.
        return htmlCommentP.matcher(html).replaceAll("");
    }

    /**
     * Returns the host part of an address, for example {@code www.baidu.com}.
     * The host may be followed by a path, port or query, or may end the address.
     *
     * @version 1.0
     * @param sourceAddress URL or host string, may be {@code null}
     * @return the host, or an empty string when none is found
     */
    public static String domainURL(String sourceAddress) {
        if (sourceAddress == null || sourceAddress.trim().length() == 0) {
            return "";
        }
        Matcher delimited = domainP.matcher(sourceAddress);
        if (delimited.find()) {
            return delimited.group();
        }
        // No '/', ':' or '?' after the host (e.g. "http://www.baidu.com"): accept a host
        // that runs to the end of the address.
        Matcher toEnd = domainToEndP.matcher(sourceAddress.trim());
        return toEnd.find() ? toEnd.group() : "";
    }

    /**
     * Strips the leading (channel) labels of a host name, keeping one label plus a
     * known suffix. Example: {@code finance.sina.com.cn} yields {@code sina.com.cn}.
     *
     * @version 1.0
     * @param domainStr host name, may be {@code null}
     * @return the shortened host name, or an empty string when the suffix is not
     *         recognised or {@code domainStr} is {@code null}
     */
    public static String cutDomainPrefix(String domainStr) {
        if (domainStr == null) {
            return "";
        }
        Matcher suffix = domainSuffixP.matcher(domainStr);
        return suffix.find() ? suffix.group() : "";
    }

    /** Removes every match of each pattern, applying the patterns in array order. */
    private static String stripAll(String text, Pattern[] patterns) {
        String result = text;
        for (Pattern pattern : patterns) {
            result = pattern.matcher(result).replaceAll("");
        }
        return result;
    }

    /** Replaces every match of the pattern with a blank line. */
    private static String breakAt(String text, Pattern pattern) {
        return pattern.matcher(text).replaceAll(PARAGRAPH_BREAK);
    }

    /** Deletes every match of the pattern. */
    private static String drop(String text, Pattern pattern) {
        return pattern.matcher(text).replaceAll("");
    }

    /** Tells whether the matched opening tag is a td, th, img or a tag whose name is followed by whitespace. */
    private static boolean keepsAttributes(String html, Matcher tag) {
        String name = tag.group(1);
        boolean protectedName = "td".equalsIgnoreCase(name) || "th".equalsIgnoreCase(name)
                || "img".equalsIgnoreCase(name) || "a".equalsIgnoreCase(name);
        // A match always ends with '>', so the character after the name exists.
        return protectedName && Character.isWhitespace(html.charAt(tag.end(1)));
    }

    /** Decodes one reference; returns {@code null} when it should be left as it is. */
    private static String decodeReference(String digits, String name) {
        if (digits == null) {
            return namedEntities.get(name);
        }
        int codePoint = Integer.parseInt(digits); // at most 7 digits, cannot overflow
        if (!Character.isValidCodePoint(codePoint)) {
            return null;
        }
        return new String(Character.toChars(codePoint));
    }

    /** Builds the table of supported named references. */
    private static Map<String, String> buildNamedEntities() {
        Map<String, String> table = new HashMap<String, String>();
        table.put("quot", "\"");
        table.put("nbsp", " ");
        table.put("middot", "\u00B7");
        table.put("amp", "&");
        table.put("ldquo", "\u201C");
        table.put("rdquo", "\u201D");
        table.put("gt", ">");
        table.put("lt", "<");
        table.put("raquo", "\u00BB");
        table.put("times", "\u00D7");
        table.put("ccedil", "\u00E7");
        table.put("atilde", "\u00E3");
        table.put("ecirc", "\u00EA");
        return table;
    }
}
