/*
 * Decompiled with CFR 0.152.
 */
package com.hankcs.test.corpus;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.dictionary.TFDictionary;
import com.hankcs.hanlp.corpus.document.CorpusLoader;
import com.hankcs.hanlp.corpus.document.Document;
import com.hankcs.hanlp.corpus.document.sentence.word.CompoundWord;
import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
import com.hankcs.hanlp.corpus.io.FolderWalker;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.dictionary.CoreDictionary;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.List;
import junit.framework.TestCase;

/**
 * One-off maintenance tasks for the tagged corpus and the bigram dictionary,
 * written as JUnit 3 test methods so they can be launched individually.
 *
 * <p>The corpus locations are hard-coded absolute Windows paths, so these
 * methods only do useful work on a machine that has that directory layout.
 */
public class AdjustCorpus extends TestCase {
    /**
     * Charset used for every file this class writes.
     * NOTE(review): assumes {@code IOUtil.readTxt} decodes the corpus as UTF-8, so that
     * a read/modify/write round trip keeps the same encoding - confirm against IOUtil.
     */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /**
     * Punctuation marks that must be followed by the {@code "/w "} tag, in the order
     * in which they are processed.
     */
    private static final String[] PUNCTUATION = {
        "\uff1a",        // fullwidth colon
        "\uff1f",        // fullwidth question mark
        "\uff0c",        // fullwidth comma
        "\uff09",        // fullwidth right parenthesis
        "\uff08",        // fullwidth left parenthesis
        "\uff01",        // fullwidth exclamation mark
        "(",
        ")",
        ",",
        "\u2018",        // left single quotation mark
        "\u2019",        // right single quotation mark
        "\u201c",        // left double quotation mark
        "\u201d",        // right double quotation mark
        ";",
        "\u2026\u2026",  // double horizontal ellipsis
        "\u3002",        // ideographic full stop
        "\u3001",        // ideographic comma
        "\u300a",        // left double angle bracket
        "\u300b",        // right double angle bracket
    };

    /**
     * Walks the 2014 corpus folder and normalises the punctuation tags of every file
     * returned by {@code FolderWalker.open}.
     *
     * @throws Exception if listing the folder fails; per-file failures are reported
     *                   by {@link #handle(File)} and do not abort the walk
     */
    public void testAdjust() throws Exception {
        List<File> fileList = FolderWalker.open("D:\\JavaProjects\\CorpusToolBox\\data\\2014\\");
        for (File file : fileList) {
            AdjustCorpus.handle(file);
        }
    }

    /**
     * Ensures every mark in {@link #PUNCTUATION} inside {@code file} is followed by
     * {@code "/w "} and rewrites the file in place when that changed its content.
     *
     * <p>Failures are reported on stderr and swallowed so that one bad file does not
     * stop the remaining files from being processed.
     *
     * @param file corpus file to normalise
     */
    private static void handle(File file) {
        try {
            String original = IOUtil.readTxt(file.getPath());
            String text = original;
            for (String mark : PUNCTUATION) {
                text = AdjustCorpus.addW(text, mark);
            }
            // Compare content rather than length: a rewrite can move a tag without
            // changing the overall length (e.g. three ellipsis characters followed by "/w ").
            if (text.equals(original)) {
                return;
            }
            BufferedWriter bw = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(file), OUTPUT_CHARSET));
            try {
                bw.write(text);
            } finally {
                // Always release the file handle, even when the write fails.
                bw.close();
            }
            System.out.println("\u4fee\u6b63\u4e86" + file);
        } catch (Exception e) {
            // Best effort per file: say which file failed, then let the caller continue.
            System.err.println("Failed to adjust " + file);
            e.printStackTrace();
        }
    }

    /**
     * Makes every occurrence of {@code c} in {@code text} be followed by {@code "/w "}.
     *
     * <p>Existing {@code c + "/w "} sequences are first collapsed back to {@code c} so
     * that occurrences which already carry the tag do not receive a second one.
     * Both steps are literal (non-regex) replacements, so {@code c} may contain
     * characters that are special in regular expressions.
     *
     * @param text text to rewrite
     * @param c    punctuation string to tag
     * @return the rewritten text
     */
    private static String addW(String text, String c) {
        String tagged = c + "/w ";
        return text.replace(tagged, c).replace(c, tagged);
    }

    /**
     * Counts every {@link CompoundWord} labelled {@code "ns"} in the 2014 corpus and
     * saves the resulting term-frequency dictionary to {@code data/test/complex_ns.txt}.
     *
     * @throws Exception if walking the corpus or saving the dictionary fails
     */
    public void testPlay() throws Exception {
        final TFDictionary tfDictionary = new TFDictionary();
        CorpusLoader.walk("D:\\JavaProjects\\CorpusToolBox\\data\\2014", new CorpusLoader.Handler(){

            @Override
            public void handle(Document document) {
                for (List<IWord> wordList : document.getComplexSentenceList()) {
                    for (IWord word : wordList) {
                        // Constant-first equals keeps a word without a label from throwing.
                        if (word instanceof CompoundWord && "ns".equals(word.getLabel())) {
                            tfDictionary.add(word.toString());
                        }
                    }
                }
            }
        });
        tfDictionary.saveTxtTo("data/test/complex_ns.txt");
    }

    /**
     * Copies the bigram dictionary to {@code BiGramDictionaryPath + "adjust.txt"},
     * leaving out (and printing) every entry whose two parts concatenate to a word
     * found in {@link CoreDictionary} while at least one part is a single character.
     *
     * <p>Each line is read as {@code first@second frequency}, split on a single space.
     *
     * @throws Exception if reading or writing fails, or if a line does not have the
     *                   expected shape
     */
    public void testAdjustNGram() throws Exception {
        IOUtil.LineIterator iterator = new IOUtil.LineIterator(HanLP.Config.BiGramDictionaryPath);
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(HanLP.Config.BiGramDictionaryPath + "adjust.txt"), OUTPUT_CHARSET));
        try {
            while (iterator.hasNext()) {
                String line = iterator.next();
                String[] params = line.split(" ");
                String[] pair = params[0].split("@", 2);
                String first = pair[0];
                String second = pair[1];
                // The frequency value is not used; it is parsed only so that a
                // malformed line still fails here instead of being copied silently.
                Integer.parseInt(params[1]);
                CoreDictionary.Attribute attribute = CoreDictionary.get(first + second);
                if (attribute != null && (first.length() == 1 || second.length() == 1)) {
                    System.out.println(line);
                    continue;
                }
                bw.write(line);
                bw.newLine();
            }
        } finally {
            // Always release the output file, even when a line fails to parse.
            bw.close();
        }
    }
}

