package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.ComplexNodePattern;
import edu.stanford.nlp.ling.tokensregex.CoreMapNodePattern;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.EnvLookup;
import edu.stanford.nlp.ling.tokensregex.MultiPatternMatcher;
import edu.stanford.nlp.ling.tokensregex.SequenceMatchResult;
import edu.stanford.nlp.ling.tokensregex.SequencePattern;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.ling.tokensregex.matcher.TrieMap;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.xalan.templates.Constants;
import org.codehaus.plexus.util.SelectorUtils;

/* loaded from: input_file:edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.class */
public class TokensRegexNERAnnotator implements Annotator {
    /** Name of the predefined "priority" header field. */
    protected static final String PRIORITY_FIELD = "priority";
    /** Value of the "ignorecase" property; selects case-insensitive flags when patterns are compiled. */
    private final boolean ignoreCase;
    /** Lines read from the "commonWords" file; a match whose text is in this set is not annotated. */
    private final Set<String> commonWords;
    /** Unmodifiable list of all entries read from the mapping files. */
    private final List<Entry> entries;
    /** Unmodifiable identity-keyed map from each compiled pattern back to the entry it was built from. */
    private final Map<SequencePattern<CoreMap>, Entry> patternToEntry;
    /** Matcher over all compiled entry patterns; used to find non-overlapping matches in a token list. */
    private final MultiPatternMatcher<CoreMap> multiPatternMatcher;
    /** Annotation keys written on matched tokens; index i receives {@code Entry.types[i]}. */
    private final List<Class> annotationFields;
    /** Background symbols, {@code null}, and every type of every entry; consulted when deciding whether an existing NER tag may be replaced. */
    private final Set<String> myLabels;
    /** Compiled "validpospattern" property, or {@code null} when the property is absent or empty. */
    private final Pattern validPosPattern;
    /** Value of the "verbose" property; enables progress and "Not annotating" messages on stderr. */
    private final boolean verbose;
    /** Unmodifiable set parsed from the "noDefaultOverwriteLabels" property (empty when the property is absent). */
    private final Set<String> noDefaultOverwriteLabels;
    /** How {@link #validPosPattern} is applied to a matched phrase. */
    private final PosMatchType posMatchType;
    /** Logging channel shared by all instances. */
    protected static final Redwood.RedwoodChannels logger = Redwood.channels("TokenRegexNER");
    /** Name of the predefined "pattern" header field. */
    protected static final String PATTERN_FIELD = "pattern";
    /** Name of the predefined "overwrite" header field. */
    protected static final String OVERWRITE_FIELD = "overwrite";
    /** Name of the predefined "weight" header field. */
    protected static final String WEIGHT_FIELD = "weight";
    /** Name of the predefined "group" header field. */
    protected static final String GROUP_FIELD = "group";
    /** Header fields that are not looked up as annotation classes by the constructor. */
    protected static final Set<String> predefinedHeaderFields = CollectionUtils.asSet(PATTERN_FIELD, OVERWRITE_FIELD, "priority", WEIGHT_FIELD, GROUP_FIELD);
    /** Default for the "posmatchtype" property. */
    public static final PosMatchType DEFAULT_POS_MATCH_TYPE = PosMatchType.MATCH_AT_LEAST_ONE_TOKEN;
    /** Default for the "mapping.header" property. */
    protected static final String defaultHeader = "pattern,ner,overwrite,priority,group";
    /** Default for the "backgroundSymbol" property (comma separated). */
    public static final String DEFAULT_BACKGROUND_SYMBOL = "O,MISC";
    /** Properties understood by this annotator; each entry appears to be (name, default value, description). */
    public static PropertiesUtils.Property[] SUPPORTED_PROPERTIES = {new PropertiesUtils.Property("mapping", DefaultPaths.DEFAULT_REGEXNER_RULES, "Comma separated list of mapping files to use."), new PropertiesUtils.Property("mapping.header", defaultHeader, "Comma separated list specifying order of fields in the mapping file"), new PropertiesUtils.Property("mapping.field.<fieldname>", "", "Class mapping for annotation fields other than ner"), new PropertiesUtils.Property("commonWords", "", "Comma separated list of files for common words to not annotate (in case your mapping isn't very clean)"), new PropertiesUtils.Property("ignorecase", "false", "Whether to ignore case or not when matching patterns."), new PropertiesUtils.Property("validpospattern", "", "Regular expression pattern for matching POS tags."), new PropertiesUtils.Property("posmatchtype", DEFAULT_POS_MATCH_TYPE.name(), "How should 'validpospattern' be used to match the POS of the tokens."), new PropertiesUtils.Property("noDefaultOverwriteLabels", "", "Comma separated list of output types for which default NER labels are not overwritten.\n For these types, only if the matched expression has NER type matching the\n specified overwriteableType for the regex will the NER type be overwritten."), new PropertiesUtils.Property("backgroundSymbol", DEFAULT_BACKGROUND_SYMBOL, "Comma separated list of NER labels to always replace."), new PropertiesUtils.Property("verbose", "false", "")};
    /** Splits a list of file names on commas or semicolons, ignoring surrounding whitespace. */
    private static Pattern FILE_DELIMITERS_PATTERN = Pattern.compile("\\s*[,;]\\s*");
    /** Splits a comma separated list, ignoring surrounding whitespace. */
    private static Pattern COMMA_DELIMITERS_PATTERN = Pattern.compile("\\s*,\\s*");

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:edu/stanford/nlp/pipeline/TokensRegexNERAnnotator$Entry.class */
    /**
     * One rule of the annotator: a pattern plus the values to write on the tokens it matches.
     */
    public static class Entry {
        /** Full TokensRegex expression; when non-null it is compiled instead of {@link #regex}. */
        public final String tokensRegex;
        /** Per-token regular expressions, used when {@link #tokensRegex} is {@code null}. */
        public final String[] regex;
        /** Interned values to write on matched tokens, in the same order as the annotator's annotation fields. */
        public final String[] types;
        /** Existing NER tags that this entry is explicitly allowed to replace. */
        public final Set<String> overwritableTypes;
        /** Priority handed to the compiled pattern. */
        public final double priority;
        /** Weight handed to the compiled pattern. */
        public final double weight;
        /** Index of the capture group whose tokens are annotated. */
        public final int annotateGroup;

        /**
         * Creates an entry.
         *
         * @param str     TokensRegex expression, or {@code null} to use {@code strArr}
         * @param strArr  per-token regular expressions
         * @param strArr2 values to annotate with; each one is interned into a private copy
         * @param set     NER tags this entry may overwrite
         * @param d       pattern priority
         * @param d2      pattern weight
         * @param i       capture group to annotate
         */
        public Entry(String str, String[] strArr, String[] strArr2, Set<String> set, double d, double d2, int i) {
            this.tokensRegex = str;
            this.regex = strArr;
            // Copy rather than alias the caller's array, interning each value on the way.
            String[] internedTypes = new String[strArr2.length];
            int index = 0;
            for (String type : strArr2) {
                internedTypes[index++] = type.intern();
            }
            this.types = internedTypes;
            this.overwritableTypes = set;
            this.priority = d;
            this.weight = d2;
            this.annotateGroup = i;
        }

        /**
         * Returns the types as a bracketed, comma separated list, e.g. {@code [PERSON,TITLE]}.
         *
         * @return the bracketed type list
         */
        public String getTypeDescription() {
            // Plain literals: the brackets have nothing to do with any external library constant.
            return "[" + StringUtils.join(this.types, ",") + "]";
        }

        /**
         * Returns a debugging description containing the pattern, types, overwritable types and priority.
         */
        @Override
        public String toString() {
            String patternText = this.tokensRegex != null ? this.tokensRegex : StringUtils.join(this.regex);
            return "Entry{" + patternText + ' ' + StringUtils.join(this.types) + ' ' + this.overwritableTypes + ' ' + this.priority + '}';
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:edu/stanford/nlp/pipeline/TokensRegexNERAnnotator$PosMatchType.class */
    /**
     * Selects how the "validpospattern" regular expression is applied to a matched phrase.
     * The setting has no effect when no valid-POS pattern is configured.
     */
    public enum PosMatchType {
        /**
         * The POS constraint is attached to every token pattern when the matcher is built
         * (only for entries given as per-token regexes, not for full TokensRegex expressions),
         * so no additional check is made after matching.
         */
        MATCH_ALL_TOKENS,
        /** A match is accepted when at least one token in the matched span has a POS tag matching the pattern. */
        MATCH_AT_LEAST_ONE_TOKEN,
        /**
         * As the name suggests, the POS constraint is meant to apply only to single-token phrases;
         * see {@code checkPosTags} for the exact condition used.
         */
        MATCH_ONE_TOKEN_PHRASE_ONLY
    }

    /**
     * Creates a case-sensitive annotator with no POS constraint.
     *
     * @param str value for the "mapping" property (the mapping file list)
     */
    public TokensRegexNERAnnotator(String str) {
        this(str, false);
    }

    /**
     * Creates an annotator with no POS constraint.
     *
     * @param str value for the "mapping" property (the mapping file list)
     * @param z   value for the "ignorecase" property
     */
    public TokensRegexNERAnnotator(String str, boolean z) {
        this(str, z, null);
    }

    /**
     * Creates an annotator named "tokenregexner" from the three most common settings.
     *
     * @param str  value for the "mapping" property (the mapping file list)
     * @param z    value for the "ignorecase" property
     * @param str2 value for the "validpospattern" property, or {@code null} to leave it unset
     */
    public TokensRegexNERAnnotator(String str, boolean z, String str2) {
        this("tokenregexner", getProperties("tokenregexner", str, z, str2));
    }

    /**
     * Builds the {@link Properties} equivalent of the convenience constructor arguments.
     *
     * @param str  annotator name used as key prefix ({@code name + "."}); {@code null} or empty means no prefix
     * @param str2 value for the "mapping" property
     * @param z    value for the "ignorecase" property
     * @param str3 value for the "validpospattern" property; skipped when {@code null}
     * @return a new property set holding the prefixed keys
     */
    private static Properties getProperties(String str, String str2, boolean z, String str3) {
        // The separator is a plain "." literal; it should not be borrowed from an XSLT library constant.
        String prefix = "";
        if (str != null && !str.isEmpty()) {
            prefix = str + ".";
        }
        Properties props = new Properties();
        props.setProperty(prefix + "mapping", str2);
        props.setProperty(prefix + "ignorecase", String.valueOf(z));
        if (str3 != null) {
            props.setProperty(prefix + "validpospattern", str3);
        }
        return props;
    }

    /**
     * Builds an annotator from configuration properties.
     *
     * <p>Every key is looked up as {@code str + "." + key} (or just {@code key} when {@code str}
     * is {@code null} or empty). Keys read: {@code backgroundSymbol}, {@code mapping},
     * {@code validpospattern}, {@code posmatchtype}, {@code commonWords}, {@code mapping.header},
     * {@code mapping.field.<name>}, {@code noDefaultOverwriteLabels}, {@code ignorecase} and
     * {@code verbose}.
     *
     * @param str        annotator name, used as property prefix and in messages
     * @param properties configuration to read
     * @throws RuntimeException              if the common words file cannot be read
     * @throws UnsupportedOperationException if {@code mapping.header} is "true"
     * @throws IllegalArgumentException      if {@code posmatchtype} is not a {@link PosMatchType} name
     */
    public TokensRegexNERAnnotator(String str, Properties properties) {
        // The separator is a plain "." literal; it should not be borrowed from an XSLT library constant.
        String prefix = (str == null || str.isEmpty()) ? "" : str + ".";
        String[] backgroundSymbols = COMMA_DELIMITERS_PATTERN.split(
                properties.getProperty(prefix + "backgroundSymbol", DEFAULT_BACKGROUND_SYMBOL));
        String[] mappingFiles = FILE_DELIMITERS_PATTERN.split(
                properties.getProperty(prefix + "mapping", DefaultPaths.DEFAULT_REGEXNER_RULES));
        String validPosRegex = properties.getProperty(prefix + "validpospattern");
        this.posMatchType = PosMatchType.valueOf(
                properties.getProperty(prefix + "posmatchtype", DEFAULT_POS_MATCH_TYPE.name()));

        String commonWordsFile = properties.getProperty(prefix + "commonWords");
        this.commonWords = new HashSet<String>();
        if (commonWordsFile != null) {
            BufferedReader reader = null;
            try {
                reader = IOUtils.getBufferedFileReader(commonWordsFile);
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    this.commonWords.add(line);
                }
            } catch (IOException e) {
                throw new RuntimeException("TokensRegexNERAnnotator " + str + ": Error opening the common words file: " + commonWordsFile, e);
            } finally {
                // Close on every path so a failed read does not leak the file handle.
                IOUtils.closeIgnoringExceptions(reader);
            }
        }

        String header = properties.getProperty(prefix + "mapping.header", defaultHeader);
        if (header.equalsIgnoreCase("true")) {
            throw new UnsupportedOperationException("Reading header from file not yet supported!!!");
        }
        String[] headerFields = COMMA_DELIMITERS_PATTERN.split(header);

        // Every header field that is not predefined must resolve to an annotation class, either
        // directly from its name or through the "mapping.field.<name>" property.
        List<String> fieldNames = new ArrayList<String>();
        List<Class> fieldClasses = new ArrayList<Class>();
        for (String field : headerFields) {
            if (predefinedHeaderFields.contains(field)) {
                continue;
            }
            Class fieldClass = EnvLookup.lookupAnnotationKeyWithClassname(null, field);
            if (fieldClass == null) {
                fieldClass = EnvLookup.lookupAnnotationKeyWithClassname(null, properties.getProperty(prefix + "mapping.field." + field));
            }
            if (fieldClass != null) {
                fieldNames.add(field);
                fieldClasses.add(fieldClass);
            } else {
                logger.warn("TokensRegexNERAnnotator " + str + ": Unknown field: " + field + " cannot find suitable annotation class");
            }
        }
        String[] annotationFieldNames = fieldNames.toArray(new String[fieldNames.size()]);
        this.annotationFields = fieldClasses;

        String noOverwrite = properties.getProperty(prefix + "noDefaultOverwriteLabels");
        if (noOverwrite != null) {
            this.noDefaultOverwriteLabels = Collections.unmodifiableSet(
                    CollectionUtils.asSet(COMMA_DELIMITERS_PATTERN.split(noOverwrite)));
        } else {
            this.noDefaultOverwriteLabels = Collections.unmodifiableSet(new HashSet<String>());
        }
        this.ignoreCase = PropertiesUtils.getBool(properties, prefix + "ignorecase", false);
        this.verbose = PropertiesUtils.getBool(properties, prefix + "verbose", false);
        if (validPosRegex == null || validPosRegex.equals("")) {
            this.validPosPattern = null;
        } else {
            this.validPosPattern = Pattern.compile(validPosRegex);
        }

        this.entries = Collections.unmodifiableList(
                readEntries(str, this.noDefaultOverwriteLabels, this.ignoreCase, this.verbose, headerFields, annotationFieldNames, mappingFiles));
        // Identity semantics: each compiled pattern object maps back to exactly its own entry.
        Map<SequencePattern<CoreMap>, Entry> patternMap = new IdentityHashMap<SequencePattern<CoreMap>, Entry>();
        this.multiPatternMatcher = createPatternMatcher(patternMap);
        this.patternToEntry = Collections.unmodifiableMap(patternMap);

        // Labels this annotator may replace by default: the background symbols, "no label" (null),
        // and every type produced by any entry.
        Set<String> labels = Generics.newHashSet();
        Collections.addAll(labels, backgroundSymbols);
        labels.add(null);
        for (Entry entry : this.entries) {
            for (String type : entry.types) {
                labels.add(type);
            }
        }
        this.myLabels = Collections.unmodifiableSet(labels);
    }

    /**
     * Applies the rules to every sentence of the annotation, or to its top-level token list
     * when the annotation carries no sentences.
     *
     * @param annotation the document to annotate in place
     * @throws RuntimeException if the annotation has neither sentences nor tokens
     */
    @Override // edu.stanford.nlp.pipeline.Annotator
    @SuppressWarnings("unchecked")
    public void annotate(Annotation annotation) {
        if (this.verbose) {
            System.err.print("Adding TokensRegexNER annotations ... ");
        }

        List<CoreMap> sentences = (List<CoreMap>) annotation.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            // No sentence split available: fall back to the document-level tokens.
            List<CoreLabel> tokens = (List<CoreLabel>) annotation.get(CoreAnnotations.TokensAnnotation.class);
            if (tokens == null) {
                throw new RuntimeException("Unable to find sentences or tokens in " + annotation);
            }
            annotateMatched(tokens);
        } else {
            for (CoreMap sentence : sentences) {
                annotateMatched((List<CoreLabel>) sentence.get(CoreAnnotations.TokensAnnotation.class));
            }
        }

        if (this.verbose) {
            System.err.println("done.");
        }
    }

    /**
     * Compiles every entry into a token sequence pattern and bundles them into one matcher.
     *
     * @param map receives a mapping from each compiled pattern to the entry it came from
     * @return a matcher over all compiled patterns
     * @throws RuntimeException if an entry's annotate group is negative or exceeds the
     *                          number of groups in its compiled pattern
     */
    private MultiPatternMatcher<CoreMap> createPatternMatcher(Map<SequencePattern<CoreMap>, Entry> map) {
        // Both flag values are 2 when ignoring case; for regex flags that is Pattern.CASE_INSENSITIVE.
        final int patternFlags = this.ignoreCase ? Pattern.CASE_INSENSITIVE : 0;
        final int stringMatchFlags = this.ignoreCase ? 2 : 0;

        Env env = TokenSequencePattern.getNewEnv();
        env.setDefaultStringPatternFlags(patternFlags);
        env.setDefaultStringMatchFlags(stringMatchFlags);

        // Only in MATCH_ALL_TOKENS mode is the POS constraint baked into each token pattern.
        ComplexNodePattern.StringAnnotationRegexPattern posConstraint = null;
        if (this.validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(this.posMatchType)) {
            posConstraint = new ComplexNodePattern.StringAnnotationRegexPattern(this.validPosPattern);
        }

        List<TokenSequencePattern> compiledPatterns = new ArrayList<TokenSequencePattern>(this.entries.size());
        for (Entry entry : this.entries) {
            TokenSequencePattern compiled;
            if (entry.tokensRegex == null) {
                // Build a sequence pattern with one node per token regex.
                List<SequencePattern.PatternExpr> nodes = new ArrayList<SequencePattern.PatternExpr>();
                for (String tokenRegex : entry.regex) {
                    CoreMapNodePattern node = CoreMapNodePattern.valueOf(tokenRegex, patternFlags);
                    if (posConstraint != null) {
                        node.add(CoreAnnotations.PartOfSpeechAnnotation.class, posConstraint);
                    }
                    nodes.add(new SequencePattern.NodePatternExpr(node));
                }
                compiled = TokenSequencePattern.compile((SequencePattern.PatternExpr) new SequencePattern.SequencePatternExpr(nodes));
            } else {
                compiled = TokenSequencePattern.compile(env, entry.tokensRegex);
            }

            boolean groupInRange = entry.annotateGroup >= 0 && entry.annotateGroup <= compiled.getTotalGroups();
            if (!groupInRange) {
                throw new RuntimeException("Invalid match group for entry " + entry);
            }
            compiled.setPriority(entry.priority);
            compiled.setWeight(entry.weight);
            compiledPatterns.add(compiled);
            map.put(compiled, entry);
        }
        return TokenSequencePattern.getMultiPatternMatcher(compiledPatterns);
    }

    /**
     * Finds all non-overlapping rule matches in a token list and writes each accepted match's
     * types onto the tokens of its annotate group.
     *
     * <p>A match is rejected when its text is a common word, when the POS check fails, or when
     * the existing NER tags must not be overwritten. Rejections are reported on stderr in
     * verbose mode.
     *
     * @param list tokens to annotate in place
     */
    private void annotateMatched(List<CoreLabel> list) {
        for (SequenceMatchResult<CoreMap> match : this.multiPatternMatcher.findNonOverlapping(list)) {
            Entry entry = this.patternToEntry.get(match.pattern());
            int groupId = entry.annotateGroup;
            int from = match.start(groupId);
            int to = match.end(groupId);
            String matchedText = match.group(groupId);

            if (this.commonWords.contains(matchedText)) {
                if (this.verbose) {
                    System.err.println("Not annotating (common word) '" + matchedText + "': " + StringUtils.joinFields(match.groupNodes(groupId), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(list, " ") + "'");
                }
                continue;
            }

            // The NER check only runs when the POS check has passed.
            boolean accepted = checkPosTags(list, from, to) && checkOrigNerTags(entry, list, from, to);
            if (!accepted) {
                if (this.verbose) {
                    System.err.println("Not annotating  '" + match.group(groupId) + "': " + StringUtils.joinFields(match.groupNodes(groupId), CoreAnnotations.NamedEntityTagAnnotation.class) + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(list, " ") + "'");
                }
                continue;
            }

            for (int tokenIndex = from; tokenIndex < to; tokenIndex++) {
                CoreLabel token = list.get(tokenIndex);
                // annotationFields and entry.types are parallel: field k receives type k.
                for (int fieldIndex = 0; fieldIndex < this.annotationFields.size(); fieldIndex++) {
                    token.set(this.annotationFields.get(fieldIndex), entry.types[fieldIndex]);
                }
            }
        }
    }

    /**
     * Decides whether a matched phrase satisfies the configured POS constraint.
     *
     * <ul>
     *   <li>No valid-POS pattern configured: always accepted.</li>
     *   <li>{@code MATCH_ALL_TOKENS}: accepted here; the constraint is built into the token
     *       patterns when the matcher is created.</li>
     *   <li>{@code MATCH_AT_LEAST_ONE_TOKEN}: accepted when any token in the span has a POS tag
     *       fully matching the pattern.</li>
     *   <li>{@code MATCH_ONE_TOKEN_PHRASE_ONLY}: phrases of more than one token are accepted
     *       unconditionally; shorter phrases are checked as above.</li>
     * </ul>
     *
     * @param list tokens of the sentence containing the match
     * @param i    index of the first matched token (inclusive)
     * @param i2   index just past the last matched token (exclusive)
     * @return {@code true} if the phrase may be annotated as far as POS tags are concerned
     */
    private boolean checkPosTags(List<CoreLabel> list, int i, int i2) {
        if (this.validPosPattern == null) {
            return true;
        }
        switch (this.posMatchType) {
            case MATCH_ONE_TOKEN_PHRASE_ONLY:
                // Test the length of the matched phrase, not of the whole sentence; using
                // list.size() here would skip the POS filter for almost every sentence.
                if (i2 - i > 1) {
                    return true;
                }
                break;
            case MATCH_AT_LEAST_ONE_TOKEN:
                break;
            case MATCH_ALL_TOKENS:
            default:
                return true;
        }
        for (int tokenIndex = i; tokenIndex < i2; tokenIndex++) {
            String pos = (String) list.get(tokenIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class);
            if (pos != null && this.validPosPattern.matcher(pos).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether the existing NER tags on a matched span may be overwritten.
     *
     * <p>The span is rejected when it cuts through a neighbouring run of an NER tag that is not
     * one of this annotator's labels. Otherwise it is accepted when the first token has no tag,
     * when the span carries mixed tags, when the entry explicitly lists the tag as overwritable,
     * or, unless one of the entry's types is in {@code noDefaultOverwriteLabels}, when the tag
     * is one of this annotator's labels.
     *
     * @param entry the rule that matched
     * @param list  tokens of the sentence containing the match
     * @param i     index of the first matched token (inclusive)
     * @param i2    index just past the last matched token (exclusive)
     * @return {@code true} if the span's tags may be replaced
     */
    private boolean checkOrigNerTags(Entry entry, List<CoreLabel> list, int i, int i2) {
        String firstTag = list.get(i).ner();
        String lastTag = list.get(i2 - 1).ner();

        // Walk left while the preceding tokens continue the first token's foreign tag.
        int before = i - 1;
        if (firstTag != null && !this.myLabels.contains(firstTag)) {
            while (before >= 0) {
                String tag = list.get(before).ner();
                if (tag == null || !tag.equals(firstTag)) {
                    break;
                }
                before--;
            }
        }

        // Walk right while the following tokens continue the last token's foreign tag.
        int after = i2;
        if (lastTag != null && !this.myLabels.contains(lastTag)) {
            while (after < list.size()) {
                String tag = list.get(after).ner();
                if (tag == null || !tag.equals(lastTag)) {
                    break;
                }
                after++;
            }
        }

        // Either walk moved: the match cuts across an existing entity, so leave it alone.
        if (before != i - 1 || after != i2) {
            return false;
        }
        // Nothing tagged at the start of the span: free to annotate.
        if (firstTag == null) {
            return true;
        }
        // Mixed tags inside the span: overwrite.
        for (int k = i + 1; k < i2; k++) {
            if (!firstTag.equals(list.get(k).ner())) {
                return true;
            }
        }
        // One consistent tag: overwrite only if the entry or the defaults allow it.
        if (entry.overwritableTypes.contains(firstTag)) {
            return true;
        }
        if (hasNoOverwritableType(this.noDefaultOverwriteLabels, entry.types)) {
            return false;
        }
        return this.myLabels.contains(firstTag);
    }

    /**
     * Reads the entries of all mapping files into one list.
     *
     * <p>The file names are sorted in place before reading. Each file is opened, handed to the
     * per-file {@code readEntries} overload, and closed again whether or not reading succeeded.
     *
     * @param str     annotator name, used in log messages
     * @param set     labels for which default NER tags are not overwritten
     * @param z       whether matching ignores case
     * @param z2      whether to be verbose
     * @param strArr  header fields of the mapping files
     * @param strArr2 names of the annotation fields
     * @param strArr3 mapping file names
     * @return all entries read
     * @throws RuntimeIOException if a mapping file cannot be read
     */
    private static List<Entry> readEntries(String str, Set<String> set, boolean z, boolean z2, String[] strArr, String[] strArr2, String... strArr3) {
        List<Entry> allEntries = new ArrayList<Entry>();
        TrieMap<String, Entry> seenRegexes = new TrieMap<String, Entry>();
        Arrays.sort(strArr3);
        for (int fileIndex = 0; fileIndex < strArr3.length; fileIndex++) {
            String mappingFile = strArr3[fileIndex];
            BufferedReader reader = null;
            try {
                reader = IOUtils.readerFromString(mappingFile);
                readEntries(str, strArr, strArr2, allEntries, seenRegexes, mappingFile, reader, set, z, z2);
            } catch (IOException e) {
                throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mappingFile, e);
            } finally {
                IOUtils.closeIgnoringExceptions(reader);
            }
        }
        // The summary is only logged when the number of files is not exactly one.
        if (strArr3.length != 1) {
            logger.log("TokensRegexNERAnnotator " + str + ": Read " + allEntries.size() + " unique entries from " + strArr3.length + " files");
        }
        return allEntries;
    }

    /**
     * Maps each header field name to its position in the header.
     *
     * @param strArr header field names in column order
     * @return a new map from field name to zero-based index
     * @throws IllegalArgumentException if a field name occurs more than once
     */
    private static Map<String, Integer> getHeaderIndexMap(String[] strArr) {
        Map<String, Integer> indexByField = new HashMap<String, Integer>();
        int position = 0;
        for (String field : strArr) {
            // put() returns the previous index when the field was already present.
            if (indexByField.put(field, Integer.valueOf(position)) != null) {
                throw new IllegalArgumentException("Duplicate header field: " + field);
            }
            position++;
        }
        return indexByField;
    }

    /**
     * Looks up the position of a header field.
     *
     * @param map field name to index mapping
     * @param str field name to look up
     * @return the index stored for {@code str}, or -1 when the map has no value for it
     */
    private static int getIndex(Map<String, Integer> map, String str) {
        Integer position = map.get(str);
        return position != null ? position.intValue() : -1;
    }

    /* JADX WARN: Code restructure failed: missing block: B:129:0x0142, code lost:
    
        throw new java.lang.IllegalArgumentException("TokensRegexNERAnnotator " + r12 + " ERROR: Provided mapping file is in wrong format. Line " + r24 + " is bad: " + r0);
     */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Per-file overload of {@code readEntries}, called once for every mapping file.
     *
     * <p>NOTE: this is a decompiler stub. The original body could not be reconstructed, so the
     * method unconditionally throws {@link UnsupportedOperationException}; as written, any call
     * that reaches it (including construction of this annotator with at least one mapping file)
     * fails. The decompiler residue above indicates that the original threw
     * {@link IllegalArgumentException} for a badly formatted line in the mapping file.
     *
     * @param r12 annotator name, used in messages
     * @param r13 header fields of the mapping file
     * @param r14 names of the annotation fields
     * @param r15 list that accumulates entries across files
     * @param r16 trie shared across files (presumably tracks already seen patterns; verify against the original source)
     * @param r17 name of the mapping file being read
     * @param r18 reader over the mapping file
     * @param r19 labels for which default NER tags are not overwritten
     * @param r20 whether matching ignores case
     * @param r21 whether to be verbose
     * @return never returns normally in this decompiled form
     * @throws java.io.IOException declared by the original signature
     */
    private static java.util.List<edu.stanford.nlp.pipeline.TokensRegexNERAnnotator.Entry> readEntries(java.lang.String r12, java.lang.String[] r13, java.lang.String[] r14, java.util.List<edu.stanford.nlp.pipeline.TokensRegexNERAnnotator.Entry> r15, edu.stanford.nlp.ling.tokensregex.matcher.TrieMap<java.lang.String, edu.stanford.nlp.pipeline.TokensRegexNERAnnotator.Entry> r16, java.lang.String r17, java.io.BufferedReader r18, java.util.Set<java.lang.String> r19, boolean r20, boolean r21) throws java.io.IOException {
        /*
            Method dump skipped, instructions count: 1362
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: edu.stanford.nlp.pipeline.TokensRegexNERAnnotator.readEntries(java.lang.String, java.lang.String[], java.lang.String[], java.util.List, edu.stanford.nlp.ling.tokensregex.matcher.TrieMap, java.lang.String, java.io.BufferedReader, java.util.Set, boolean, boolean):java.util.List");
    }

    /**
     * Tells whether any of the given types is contained in the given set.
     *
     * @param set    labels to test against
     * @param strArr types to look for
     * @return {@code true} as soon as one element of {@code strArr} is found in {@code set},
     *         {@code false} if none is (including when {@code strArr} is empty)
     */
    private static boolean hasNoOverwritableType(Set<String> set, String[] strArr) {
        int index = 0;
        while (index < strArr.length) {
            if (set.contains(strArr[index])) {
                return true;
            }
            index++;
        }
        return false;
    }

    /**
     * Returns the requirements of this annotator.
     *
     * @return {@code StanfordCoreNLP.TOKENIZE_AND_SSPLIT}
     */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public Set<Annotator.Requirement> requires() {
        return StanfordCoreNLP.TOKENIZE_AND_SSPLIT;
    }

    /**
     * Returns the requirements this annotator satisfies for later annotators.
     *
     * @return an empty set; this annotator declares no satisfied requirements
     */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public Set<Annotator.Requirement> requirementsSatisfied() {
        return Collections.emptySet();
    }
}
