/*
 * Decompiled with CFR 0.152.
 */
package marytts.modules;

import de.dfki.lt.tools.tokenizer.JTok;
import de.dfki.lt.tools.tokenizer.annotate.AnnotatedString;
import de.dfki.lt.tools.tokenizer.annotate.FastAnnotatedString;
import java.util.Locale;
import java.util.Properties;
import marytts.datatypes.MaryData;
import marytts.datatypes.MaryDataType;
import marytts.datatypes.MaryXML;
import marytts.modules.InternalModule;
import marytts.util.dom.DomUtils;
import marytts.util.dom.MaryDomUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.Text;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeIterator;

/**
 * MARY module that tokenises the text content of a MaryXML document with JTok.
 *
 * <p>By default the module consumes {@code RAWMARYXML} and produces {@code TOKENS}.
 * During {@link #process(MaryData)} every token found by JTok is represented by a
 * {@code t} element; runs of tokens are wrapped into {@code s} (sentence) and
 * {@code p} (paragraph) elements according to the "border" annotations delivered
 * by JTok, unless such enclosing elements already exist in the input.
 */
public class JTokeniser extends InternalModule {
    /** Maximum number of characters kept per token; longer tokens are cut in {@link #process(MaryData)}. */
    public static final int TOKEN_MAXLENGTH = 100;

    /** Node iterator mask selecting text nodes only (numeric value 4, as in the original code). */
    private static final int SHOW_TEXT = org.w3c.dom.traversal.NodeFilter.SHOW_TEXT;

    /**
     * Value returned by {@code setIndex} that terminates the token loop. It has the same
     * numeric value as {@code java.text.CharacterIterator.DONE}.
     */
    private static final char END_OF_TEXT = '\uffff';

    /** Annotation key under which each DOM text node is attached to its span of the plain text. */
    private static final String MARYXML_ANNOTATION = "MARYXML";
    /** JTok annotation key whose runs delimit tokens. */
    private static final String CLASS_ANNOTATION = "class";
    /** JTok annotation key present on tokens that start a new sentence or paragraph. */
    private static final String BORDER_ANNOTATION = "border";
    /** Value of the border annotation that additionally marks a paragraph border. */
    private static final String PARAGRAPH_BORDER = "p";

    private static final String TOKEN_TAG = "t";
    private static final String SENTENCE_TAG = "s";
    private static final String PARAGRAPH_TAG = "p";
    private static final String BOUNDARY_TAG = "boundary";

    private JTok jtok;
    /** Language code handed to JTok; also used to build the JTok resource path in {@link #startup()}. */
    private String jtokLocale;

    /** Creates a tokeniser without a locale; the JTok language then defaults to {@code "en"}. */
    public JTokeniser() {
        this((Locale)null);
    }

    /**
     * Creates a tokeniser for the locale described by the given string.
     *
     * <p>Delegates to the full constructor so that the JTok language code is initialised;
     * calling {@code super(...)} directly left it {@code null} and made {@link #startup()} fail.
     *
     * @param locale locale string, passed unchanged to {@link Locale#Locale(String)}
     */
    public JTokeniser(String locale) {
        this(MaryDataType.RAWMARYXML, MaryDataType.TOKENS, new Locale(locale));
    }

    /**
     * Creates a tokeniser from {@code RAWMARYXML} to {@code TOKENS} for the given locale.
     *
     * @param locale module locale, may be {@code null}
     */
    public JTokeniser(Locale locale) {
        this(MaryDataType.RAWMARYXML, MaryDataType.TOKENS, locale);
    }

    /**
     * Creates a tokeniser with explicit input and output data types.
     *
     * @param inputType  data type accepted by this module
     * @param outputType data type produced by this module
     * @param locale     module locale; when {@code null} the JTok language is {@code "en"},
     *                   otherwise it is {@code locale.getLanguage()}
     */
    public JTokeniser(MaryDataType inputType, MaryDataType outputType, Locale locale) {
        super("JTokeniser", inputType, outputType, locale);
        this.jtokLocale = locale == null ? "en" : locale.getLanguage();
    }

    /**
     * Overrides the language code used for JTok. Only effective for the JTok instance
     * when called before {@link #startup()}, which is where that instance is created.
     *
     * @param languageCode language code to pass to JTok
     */
    protected void setTokenizerLanguage(String languageCode) {
        this.jtokLocale = languageCode;
    }

    /**
     * Starts the module and creates the JTok instance, configured with the single
     * language {@code jtokLocale} whose resources are looked up under {@code "jtok/<language>"}.
     */
    @Override
    public void startup() throws Exception {
        super.startup();
        Properties jtokProperties = new Properties();
        jtokProperties.setProperty("languages", this.jtokLocale);
        jtokProperties.setProperty(this.jtokLocale, "jtok/" + this.jtokLocale);
        this.jtok = new JTok(jtokProperties);
    }

    /**
     * Tokenises the document carried by {@code d} in place and returns it wrapped in a new
     * {@link MaryData} of this module's output type.
     *
     * @param d input data; its DOM document is modified
     * @return new data object holding the same, now tokenised, document and the locale of {@code d}
     */
    @Override
    public MaryData process(MaryData d) throws Exception {
        Document doc = d.getDocument();
        String inputText = JTokeniser.collectText(doc);
        FastAnnotatedString maryText = JTokeniser.annotateTextNodes(doc, inputText);
        AnnotatedString tokenisedText = this.jtok.tokenize(inputText, this.jtokLocale);
        this.insertTokens(doc, maryText, tokenisedText);
        this.truncateLongTokens(doc);
        MaryData result = new MaryData(this.outputType(), d.getLocale());
        result.setDocument(doc);
        return result;
    }

    /** Creates an iterator over all text nodes of the document, in document order. */
    private static NodeIterator textNodes(Document doc) {
        return ((DocumentTraversal)doc).createNodeIterator(doc, SHOW_TEXT, null, false);
    }

    /**
     * Decides whether a single space has to separate the text collected so far from the next
     * trimmed chunk: there is preceding text, it does not end in whitespace, and the chunk
     * starts with a letter or digit.
     */
    private static boolean needsSeparator(CharSequence text, int end, String next) {
        return end > 0
                && !Character.isWhitespace(text.charAt(end - 1))
                && Character.isLetterOrDigit(next.charAt(0));
    }

    /** Concatenates the trimmed, non-empty contents of all text nodes into one plain string. */
    private static String collectText(Document doc) {
        NodeIterator ni = JTokeniser.textNodes(doc);
        StringBuilder collected = new StringBuilder();
        Text textNode;
        while ((textNode = (Text)ni.nextNode()) != null) {
            String text = textNode.getData().trim();
            if (text.length() == 0) {
                continue;
            }
            if (JTokeniser.needsSeparator(collected, collected.length(), text)) {
                collected.append(" ");
            }
            collected.append(text);
        }
        return collected.toString();
    }

    /**
     * Builds an annotated copy of {@code inputText} in which each span is annotated with the
     * DOM text node it came from. The offsets are recomputed with exactly the same rules as
     * in {@link #collectText(Document)}, so both walks must stay in sync.
     */
    private static FastAnnotatedString annotateTextNodes(Document doc, String inputText) {
        FastAnnotatedString maryText = new FastAnnotatedString(inputText);
        NodeIterator ni = JTokeniser.textNodes(doc);
        int pos = 0;
        Text textNode;
        while ((textNode = (Text)ni.nextNode()) != null) {
            String text = textNode.getData().trim();
            int len = text.length();
            if (len == 0) {
                continue;
            }
            if (JTokeniser.needsSeparator(inputText, pos, text)) {
                // Skip the separator blank that collectText inserted here.
                ++pos;
            }
            maryText.annotate(MARYXML_ANNOTATION, textNode, pos, pos + len);
            pos += len;
        }
        return maryText;
    }

    /**
     * Walks over the tokens found by JTok, makes sure each one is represented by a {@code t}
     * element, removes the text nodes that have been replaced by tokens, and wraps tokens into
     * sentence and paragraph elements at the borders announced by JTok and at the end of the text.
     */
    private void insertTokens(Document doc, FastAnnotatedString maryText, AnnotatedString tokenisedText) {
        Element firstTokenInSentence = null;
        Element firstTokenInParagraph = null;
        Element previousToken = null;
        Node currentTextNode = null;
        char c = tokenisedText.setIndex(0);
        maryText.setIndex(0);
        while (c != END_OF_TEXT) {
            int tokenStart = tokenisedText.getRunStart(CLASS_ANNOTATION);
            int tokenEnd = tokenisedText.getRunLimit(CLASS_ANNOTATION);
            // Runs without a class annotation are not tokens and are skipped.
            if (tokenisedText.getAnnotation(CLASS_ANNOTATION) != null) {
                maryText.setIndex(tokenStart);
                Text tn = (Text)maryText.getAnnotation(MARYXML_ANNOTATION);
                assert (tn != null);
                Element t;
                if (MaryDomUtils.hasAncestor(tn, TOKEN_TAG)) {
                    // The input already marks this text as a token; reuse that element.
                    t = (Element)MaryDomUtils.getAncestor((Node)tn, TOKEN_TAG);
                } else {
                    String token = tokenisedText.substring(tokenStart, tokenEnd);
                    t = MaryXML.createElement(doc, TOKEN_TAG);
                    MaryDomUtils.setTokenText(t, token);
                    tn.getParentNode().insertBefore(t, tn);
                }
                // Once we have moved on to another text node, the previous one has been fully
                // replaced by token elements and can be dropped.
                if (currentTextNode != null && currentTextNode != tn && !MaryDomUtils.hasAncestor(currentTextNode, TOKEN_TAG)) {
                    currentTextNode.getParentNode().removeChild(currentTextNode);
                }
                currentTextNode = tn;
                Object border = tokenisedText.getAnnotation(BORDER_ANNOTATION);
                if (border != null) {
                    // The current token opens a new sentence: close the one before it.
                    this.closeSentence(firstTokenInSentence, previousToken);
                    firstTokenInSentence = null;
                    // Value comparison; the original relied on string identity.
                    if (PARAGRAPH_BORDER.equals(border)) {
                        JTokeniser.closeParagraph(firstTokenInParagraph, previousToken);
                        firstTokenInParagraph = null;
                    }
                }
                previousToken = t;
                if (firstTokenInSentence == null) {
                    firstTokenInSentence = t;
                }
                if (firstTokenInParagraph == null) {
                    firstTokenInParagraph = t;
                }
            }
            c = tokenisedText.setIndex(tokenEnd);
            maryText.setIndex(tokenEnd);
        }
        if (currentTextNode != null && !MaryDomUtils.hasAncestor(currentTextNode, TOKEN_TAG)) {
            currentTextNode.getParentNode().removeChild(currentTextNode);
        }
        // Close whatever sentence and paragraph are still open at the end of the text.
        this.closeSentence(firstTokenInSentence, previousToken);
        JTokeniser.closeParagraph(firstTokenInParagraph, previousToken);
    }

    /**
     * Wraps the tokens {@code first..last} into a sentence element, provided a sentence is open,
     * neither token is already inside a sentence, and both tokens belong to the same paragraph
     * element (or to none).
     */
    private void closeSentence(Element first, Element last) {
        if (first == null) {
            return;
        }
        assert (last != null);
        if (MaryDomUtils.hasAncestor(first, SENTENCE_TAG) || MaryDomUtils.hasAncestor(last, SENTENCE_TAG)) {
            return;
        }
        Element firstPara = (Element)MaryDomUtils.getAncestor(first, PARAGRAPH_TAG);
        Element lastPara = (Element)MaryDomUtils.getAncestor(last, PARAGRAPH_TAG);
        // Null-safe: the original dereferenced firstPara when only lastPara was non-null.
        boolean sameParagraph = firstPara == null ? lastPara == null : firstPara.equals(lastPara);
        if (sameParagraph) {
            this.encloseWithSentence(first, last);
        }
    }

    /**
     * Wraps the sentences containing the tokens {@code first..last} into a paragraph element,
     * provided a paragraph is open and neither token is already inside one.
     */
    private static void closeParagraph(Element first, Element last) {
        if (first == null) {
            return;
        }
        assert (last != null);
        if (MaryDomUtils.hasAncestor(first, PARAGRAPH_TAG) || MaryDomUtils.hasAncestor(last, PARAGRAPH_TAG)) {
            return;
        }
        DomUtils.encloseNodesWithNewElement(DomUtils.getAncestor(first, SENTENCE_TAG), DomUtils.getAncestor(last, SENTENCE_TAG), PARAGRAPH_TAG);
    }

    /** Cuts the text of every token longer than {@link #TOKEN_MAXLENGTH} and logs the change. */
    private void truncateLongTokens(Document doc) {
        NodeIterator tIt = MaryDomUtils.createNodeIterator(doc.getDocumentElement(), TOKEN_TAG);
        Element t;
        while ((t = (Element)tIt.nextNode()) != null) {
            String tokenText = MaryDomUtils.tokenText(t);
            if (tokenText.length() <= TOKEN_MAXLENGTH) {
                continue;
            }
            String cutTT = tokenText.substring(0, TOKEN_MAXLENGTH);
            this.logger.info("Cutting exceedingly long input token (length " + tokenText.length() + " ) to length " + TOKEN_MAXLENGTH + ":\n" + "before: " + tokenText + "\nafter: " + cutTT);
            MaryDomUtils.setTokenText(t, cutTT);
        }
    }

    /**
     * Encloses the given token range in a new sentence element. A {@code boundary} element that
     * is the sibling element directly before the first token or directly after the last token
     * is pulled into the sentence as well.
     *
     * @param firstTokenInSentence first token of the sentence
     * @param lastTokenInSentence  last token of the sentence
     */
    private void encloseWithSentence(Element firstTokenInSentence, Element lastTokenInSentence) {
        Element encloseFromHere = firstTokenInSentence;
        Element maybeBoundary = DomUtils.getPreviousSiblingElement(firstTokenInSentence);
        if (maybeBoundary != null && maybeBoundary.getTagName().equals(BOUNDARY_TAG)) {
            encloseFromHere = maybeBoundary;
        }
        Element encloseToHere = lastTokenInSentence;
        maybeBoundary = DomUtils.getNextSiblingElement(lastTokenInSentence);
        if (maybeBoundary != null && maybeBoundary.getTagName().equals(BOUNDARY_TAG)) {
            encloseToHere = maybeBoundary;
        }
        DomUtils.encloseNodesWithNewElement(encloseFromHere, encloseToHere, SENTENCE_TAG);
    }
}

