package de.tudarmstadt.ukp.dkpro.core.stanfordnlp;

import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import edu.stanford.nlp.international.arabic.process.ArabicTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.internal.EnhancedClassFile;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.codehaus.plexus.util.SelectorUtils;

@ResourceMetaData(name = "de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter", description = "", version = "1.8.0", vendor = "DKPro Core Project", copyright = "Copyright 2010-2015\n                            Ubiquitous Knowledge Processing (UKP) Lab\n                            Technische Universität Darmstadt")
@EnhancedClassFile
/* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter.class */
public class StanfordSegmenter extends SegmenterBase {
    /** Registry of tokenizer factories keyed by language code; filled by the static initializer of this class. */
    private static final Map<String, InternalTokenizerFactory> tokenizerFactories = new HashMap<String, InternalTokenizerFactory>();

    /**
     * Language code whose tokenizer is used when no tokenizer is registered for the document
     * language. When unset, an unsupported document language is reported as an error.
     */
    public static final String PARAM_LANGUAGE_FALLBACK = "languageFallback";

    @ConfigurationParameter(name = PARAM_LANGUAGE_FALLBACK, mandatory = false)
    private String languageFallback;

    /** Regular expression describing the sentence boundary tokens. */
    public static final String PARAM_BOUNDARY_TOKEN_REGEX = "boundaryTokenRegex";

    @ConfigurationParameter(name = PARAM_BOUNDARY_TOKEN_REGEX, mandatory = false, defaultValue = {WordToSentenceProcessor.DEFAULT_BOUNDARY_REGEX}, description = "The set of boundary tokens. If null, use default.")
    private String boundaryTokenRegex;

    /** Tokens that may be attached to a sentence after its boundary token. */
    public static final String PARAM_BOUNDARY_FOLLOWERS = "boundaryFollowers";

    // The closing square bracket is spelled out as the literal "]" so the defaults do not depend
    // on a same-valued constant from an unrelated library.
    @ConfigurationParameter(name = PARAM_BOUNDARY_FOLLOWERS, mandatory = false, defaultValue = {")", "]", "}", "\"", "'", "''", "’", "”", "-RRB-", "-RSB-", "-RCB-", ")", "]", "}"}, description = "This is a Set of String that are matched with .equals() which are allowed to be tacked onto\nthe end of a sentence after a sentence boundary token, for example \")\".")
    private Set<String> boundaryFollowers;

    /** XML element names that are removed from the output and always end a sentence. */
    public static final String PARAM_XML_BREAK_ELEMENTS_TO_DISCARD = "xmlBreakElementsToDiscard";

    @ConfigurationParameter(name = PARAM_XML_BREAK_ELEMENTS_TO_DISCARD, mandatory = false, description = "These are elements like \"p\" or \"sent\", which will be wrapped into regex for approximate XML\nmatching. They will be deleted in the output, and will always trigger a sentence boundary.")
    private Set<String> xmlBreakElementsToDiscard;

    /** Sentence boundary tokens that are dropped instead of being kept in a sentence. */
    public static final String PARAM_BOUNDARIES_TO_DISCARD = "boundaryToDiscard";

    @ConfigurationParameter(name = PARAM_BOUNDARIES_TO_DISCARD, mandatory = false, defaultValue = {"\n", "*NL*"}, description = "The set of regex for sentence boundary tokens that should be discarded.")
    private Set<String> boundariesToDiscard;

    /** Regular expression for names of elements that delimit the regions containing sentences. */
    public static final String PARAM_REGION_ELEMENT_REGEX = "regionElementRegex";

    @ConfigurationParameter(name = PARAM_REGION_ELEMENT_REGEX, mandatory = false, description = "A regular expression for element names containing a sentence region. Only tokens in such\nelements will be included in sentences. The start and end tags themselves are not included in\nthe sentence.")
    private String regionElementRegex;

    /** Strategy for treating newlines as breaks; defaults to {@code TWO_CONSECUTIVE}. */
    public static final String PARAM_NEWLINE_IS_SENTENCE_BREAK = "newlineIsSentenceBreak";

    @ConfigurationParameter(name = PARAM_NEWLINE_IS_SENTENCE_BREAK, mandatory = false, defaultValue = {"TWO_CONSECUTIVE"}, description = "Strategy for treating newlines as paragraph breaks.")
    private WordToSentenceProcessor.NewlineIsSentenceBreak newlineIsSentenceBreak;

    /** Regular expressions handed to the sentence splitter as its token-discard set. */
    public static final String PARAM_TOKEN_REGEXES_TO_DISCARD = "tokenRegexesToDiscard";

    // NOTE(review): the description below is identical to the one of PARAM_BOUNDARIES_TO_DISCARD;
    // it presumably should describe regexes for arbitrary tokens to drop - confirm and reword.
    @ConfigurationParameter(name = PARAM_TOKEN_REGEXES_TO_DISCARD, mandatory = false, defaultValue = {}, description = "The set of regex for sentence boundary tokens that should be discarded.")
    private Set<String> tokenRegexesToDiscard;

    /** Whether the complete input is treated as a single sentence; defaults to {@code false}. */
    public static final String PARAM_IS_ONE_SENTENCE = "isOneSentence";

    @ConfigurationParameter(name = PARAM_IS_ONE_SENTENCE, mandatory = true, defaultValue = {"false"}, description = "Whether to treat all input as one sentence.")
    private boolean isOneSentence;

    /** Whether empty sentences may be generated; defaults to {@code false}. */
    public static final String PARAM_ALLOW_EMPTY_SENTENCES = "allowEmptySentences";

    @ConfigurationParameter(name = PARAM_ALLOW_EMPTY_SENTENCES, mandatory = true, defaultValue = {"false"}, description = "Whether to generate empty sentences.")
    private boolean allowEmptySentences;

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter$InternalArabicTokenizerFactory.class */
    /** Tokenizer factory for Arabic text, backed by {@link ArabicTokenizer}. */
    private static class InternalArabicTokenizerFactory implements InternalTokenizerFactory {

        /**
         * Builds an Arabic tokenizer that reads the given text.
         *
         * @param str the text to tokenize
         * @return a tokenizer created with an empty set of options
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.InternalTokenizerFactory
        public Tokenizer<?> create(String str) {
            StringReader input = new StringReader(str);
            Properties noOptions = new Properties();
            return ArabicTokenizer.newArabicTokenizer(input, noOptions);
        }
    }

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter$InternalFrenchTokenizerFactory.class */
    /** Tokenizer factory for French text, backed by {@link FrenchTokenizer}. */
    private static class InternalFrenchTokenizerFactory implements InternalTokenizerFactory {
        /** Option string handed to the French tokenizer. */
        private static final String OPTIONS = "tokenizeNLs=false";

        /**
         * Builds a French tokenizer that reads the given text.
         *
         * @param str the text to tokenize
         * @return a tokenizer configured with {@code tokenizeNLs=false}
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.InternalTokenizerFactory
        public Tokenizer<?> create(String str) {
            StringReader input = new StringReader(str);
            return FrenchTokenizer.factory().getTokenizer(input, OPTIONS);
        }
    }

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter$InternalPTBTokenizerFactory.class */
    /** Tokenizer factory for English text, backed by {@link PTBTokenizer}. */
    private static class InternalPTBTokenizerFactory implements InternalTokenizerFactory {
        private InternalPTBTokenizerFactory() {
        }

        /**
         * Builds a PTB tokenizer that reads the given text and emits {@link CoreLabel} tokens.
         *
         * @param str the text to tokenize
         * @return a tokenizer created with the {@code invertible} option
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.InternalTokenizerFactory
        public Tokenizer<?> create(String str) {
            // Parameterized instead of raw so the token type produced by the factory is checked.
            return new PTBTokenizer<CoreLabel>(new StringReader(str), new CoreLabelTokenFactory(), "invertible");
        }
    }

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter$InternalSpanishTokenizerFactory.class */
    /** Tokenizer factory for Spanish text, backed by {@link SpanishTokenizer}. */
    private static class InternalSpanishTokenizerFactory implements InternalTokenizerFactory {

        /**
         * Builds a Spanish tokenizer that reads the given text and emits {@link CoreLabel} tokens.
         *
         * @param str the text to tokenize
         * @return a tokenizer obtained from a factory created without an option string
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.InternalTokenizerFactory
        public Tokenizer<?> create(String str) {
            CoreLabelTokenFactory labelFactory = new CoreLabelTokenFactory();
            // The second argument is the option string; none is supplied.
            return SpanishTokenizer.factory(labelFactory, null).getTokenizer(new StringReader(str));
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/stanfordnlp/StanfordSegmenter$InternalTokenizerFactory.class */
    /**
     * Creates a tokenizer for a piece of text. One implementation per supported language is
     * registered in the tokenizer factory map of the enclosing class.
     */
    public interface InternalTokenizerFactory {
        /**
         * Creates a tokenizer that reads the given text.
         *
         * @param str the text to tokenize
         * @return a new tokenizer over {@code str}
         */
        Tokenizer<?> create(String str);
    }

    /**
     * Segments one zone of the document into tokens and/or sentences.
     *
     * <p>If token writing is enabled, the zone text is tokenized with the tokenizer for the
     * document language and a token annotation is created per token. If sentence writing is
     * enabled, the tokens (freshly created ones, or the ones already present in the zone when
     * token writing is disabled) are grouped into sentences by a {@link WordToSentenceProcessor}
     * configured from the parameters of this component.
     *
     * @param jCas the CAS to which annotations are added
     * @param str the text of the zone to segment
     * @param i the offset of the zone within the document text
     * @throws AnalysisEngineProcessException if no tokenizer is available for the language, if the
     *         tokenizer returns an unsupported token type, or if a token cannot be aligned with
     *         the zone text
     */
    @Override // de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase
    protected void process(JCas jCas, String str, int i) throws AnalysisEngineProcessException {
        String language = getLanguage(jCas);
        List<Token> tokens = null;
        if (isWriteToken()) {
            tokens = tokenizeZone(jCas, language, str, i);
        }
        if (isWriteSentence()) {
            if (tokens == null) {
                // Token writing is disabled: fall back to the tokens already present in the zone.
                tokens = JCasUtil.selectCovered(jCas, Token.class, i, i + str.length());
            }
            splitSentences(jCas, tokens);
        }
    }

    /**
     * Tokenizes the zone text and creates a token annotation for every token.
     *
     * @return the created token annotations in tokenizer order
     */
    private List<Token> tokenizeZone(JCas jCas, String language, String str, int i) throws AnalysisEngineProcessException {
        List<Token> tokens = new ArrayList<Token>();
        List<?> rawTokens = getTokenizer(language, str).tokenize();
        int textLength = str.length();
        // Position in str just behind the previously handled token.
        int cursor = 0;
        for (int index = 0; index < rawTokens.size(); index++) {
            Object raw = rawTokens.get(index);
            int begin;
            int end;
            if (raw instanceof CoreLabel) {
                // CoreLabels carry their own character offsets relative to the zone text.
                CoreLabel label = (CoreLabel) raw;
                begin = label.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                end = label.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            } else {
                String tokenText = null;
                if (raw instanceof String) {
                    tokenText = (String) raw;
                } else if (raw instanceof Word) {
                    tokenText = ((Word) raw).word();
                }
                if (tokenText == null) {
                    throw new AnalysisEngineProcessException(new IllegalStateException("Unknown token type: " + (raw == null ? null : raw.getClass())));
                }
                // Tokens without offsets are aligned by skipping whitespace and matching the text.
                int scanStart = cursor;
                while (cursor < textLength && Character.isWhitespace(str.charAt(cursor))) {
                    cursor++;
                }
                if (cursor >= textLength && cursor > scanStart) {
                    // Only whitespace was left in the zone: stop tokenizing.
                    break;
                }
                if (!str.startsWith(tokenText, cursor)) {
                    // Offsets are clamped so that building the message cannot itself fail.
                    String found = str.substring(Math.min(cursor, textLength), Math.min(cursor + tokenText.length(), textLength));
                    throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: [" + tokenText + "] CAS: [" + found + "]"));
                }
                begin = cursor;
                end = cursor + tokenText.length();
            }
            Token token = createToken(jCas, i + begin, i + end, index);
            // NOTE(review): createToken is inherited and not visible here; the guard keeps a
            // possible null result (e.g. for a rejected span) out of the sentence splitting.
            if (token != null) {
                tokens.add(token);
            }
            cursor = end;
        }
        return tokens;
    }

    /**
     * Groups the given tokens into sentences and creates a sentence annotation for each
     * non-empty group.
     */
    private void splitSentences(JCas jCas, List<Token> tokens) {
        // Mirror every token as a CoreLabel carrying its document offsets and covered text.
        List<CoreLabel> labels = new ArrayList<CoreLabel>();
        for (Token token : tokens) {
            CoreLabel label = new CoreLabel();
            label.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, Integer.valueOf(token.getBegin()));
            label.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, Integer.valueOf(token.getEnd()));
            label.setWord(token.getCoveredText());
            labels.add(label);
        }
        // The result is discarded, so this relies on the processor escaping the labels in place
        // - TODO confirm against the CoreNLP version in use.
        new PTBEscapingProcessor().apply(labels);
        WordToSentenceProcessor<CoreLabel> splitter = new WordToSentenceProcessor<CoreLabel>(this.boundaryTokenRegex, this.boundaryFollowers, this.boundariesToDiscard, this.xmlBreakElementsToDiscard, this.regionElementRegex, this.newlineIsSentenceBreak, null, this.tokenRegexesToDiscard, this.isOneSentence, this.allowEmptySentences);
        for (List<CoreLabel> sentence : splitter.process(labels)) {
            if (sentence.isEmpty()) {
                // An empty sentence (possible with allowEmptySentences) has no offsets to annotate.
                continue;
            }
            int begin = sentence.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            createSentence(jCas, begin, end);
        }
    }

    /**
     * Creates a tokenizer for the given text, choosing the factory by language.
     *
     * <p>The factory registered for {@code str} is used if there is one; otherwise the factory
     * registered for the configured fallback language is used.
     *
     * @param str the language code of the document
     * @param str2 the text to tokenize
     * @return a new tokenizer over {@code str2}
     * @throws AnalysisEngineProcessException if no factory is registered for the language and
     *         either no fallback language is configured or the fallback has no factory either
     */
    private Tokenizer<?> getTokenizer(String str, String str2) throws AnalysisEngineProcessException {
        InternalTokenizerFactory factory = tokenizerFactories.get(str);
        if (factory != null) {
            return factory.create(str2);
        }
        if (this.languageFallback == null) {
            throw new AnalysisEngineProcessException(Messages.BUNDLE, Messages.ERR_UNSUPPORTED_LANGUAGE, new String[]{str});
        }
        factory = tokenizerFactories.get(this.languageFallback);
        if (factory == null) {
            // The error names the fallback language, since that is the lookup that failed last.
            throw new AnalysisEngineProcessException(Messages.BUNDLE, Messages.ERR_UNSUPPORTED_LANGUAGE, new String[]{this.languageFallback});
        }
        return factory.create(str2);
    }

    // Registers the tokenizer factory for every supported language. Language codes are spelled
    // out as literals rather than borrowed from same-valued constants of unrelated libraries.
    static {
        tokenizerFactories.put("ar", new InternalArabicTokenizerFactory());
        tokenizerFactories.put("en", new InternalPTBTokenizerFactory());
        tokenizerFactories.put("es", new InternalSpanishTokenizerFactory());
        tokenizerFactories.put("fr", new InternalFrenchTokenizerFactory());
    }
}
