package de.uniwue.aries.segmentation;

import de.uniwue.aries.AnnoHelper;
import de.uniwue.aries.sectionization.SectionType;
import de.uniwue.aries.uima.types.Types;
import de.uniwue.aries.uima.util.UIMAUtils;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import kotlin.Metadata;
import kotlin.collections.CollectionsKt;
import kotlin.collections.MapsKt;
import kotlin.comparisons.ComparisonsKt;
import kotlin.io.TextStreamsKt;
import kotlin.jvm.functions.Function1;
import kotlin.jvm.internal.Intrinsics;
import kotlin.text.Charsets;
import kotlin.text.Regex;
import kotlin.text.StringsKt;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

/* compiled from: Segmentation.kt */
@Metadata(mv = {1, 5, 1}, k = 1, xi = 48, d1 = {"��D\n\u0002\u0018\u0002\n\u0002\u0010��\n\u0002\b\u0002\n\u0002\u0010\u000e\n��\n\u0002\u0018\u0002\n��\n\u0002\u0010\u0002\n��\n\u0002\u0018\u0002\n\u0002\b\u0002\n\u0002\u0018\u0002\n��\n\u0002\u0010 \n\u0002\u0018\u0002\n\u0002\b\n\n\u0002\u0010$\n\u0002\u0018\u0002\n\u0002\b\u0005\bÆ\u0002\u0018��2\u00020\u0001B\u0007\b\u0002¢\u0006\u0002\u0010\u0002J \u0010\u0007\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\n2\u0006\u0010\u000b\u001a\u00020\u00042\u0006\u0010\f\u001a\u00020\rH\u0002J\u0016\u0010\u000e\u001a\b\u0012\u0004\u0012\u00020\u00100\u000f2\u0006\u0010\t\u001a\u00020\nH\u0002J\u0010\u0010\u0011\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\nH\u0002J\u0015\u0010\u0012\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\nH��¢\u0006\u0002\b\u0013J\u0010\u0010\u0014\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\nH\u0002J\u0010\u0010\u0015\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\nH\u0002J\u0010\u0010\u0016\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\nH\u0002J\u0018\u0010\u0017\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\n2\u0006\u0010\u0018\u001a\u00020\u0004H\u0002J$\u0010\u0019\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\n2\u0012\u0010\u001a\u001a\u000e\u0012\u0004\u0012\u00020\u001c\u0012\u0004\u0012\u00020\u00040\u001bH\u0002J\n\u0010\u001d\u001a\u0004\u0018\u00010\u0006H\u0002J \u0010\u001e\u001a\u00020\b2\u0006\u0010\t\u001a\u00020\n2\u0006\u0010\f\u001a\u00020\r2\u0006\u0010\u001f\u001a\u00020\rH\u0002J.\u0010 \u001a\u00020\b2\u0006\u0010\t\u001a\u00020\n2\b\b\u0002\u0010\u0018\u001a\u00020\u00042\u0014\b\u0002\u0010\u001a\u001a\u000e\u0012\u0004\u0012\u00020\u001c\u0012\u0004\u0012\u00020\u00040\u001bR\u000e\u0010\u0003\u001a\u00020\u0004X\u0082T¢\u0006\u0002\n��R\u0010\u0010\u0005\u001a\u0004\u0018\u00010\u0006X\u0082\u000e¢\u0006\u0002\n��¨\u0006!"}, d2 = {"Lde/uniwue/aries/segmentation/Segmentation;", "", "()V", "MONTHLIST", "", "abbrevPatternCache", "Ljava/util/regex/Pattern;", "createBlockAnnotations", "", "cas", 
"Lorg/apache/uima/cas/CAS;", "regex", "tBlock", "Lorg/apache/uima/cas/Type;", "createSegments", "", "Lorg/apache/uima/cas/text/AnnotationFS;", "filterWrongStops", "findAbbreviations", "findAbbreviations$aries_segmentation", "findBlocks", "findDates", "findNumbers", "findSegmentStops", "splitter", "findSegmentStopsInSections", "section2splitter", "", "Lde/uniwue/aries/sectionization/SectionType;", "getAbbreviationPattern", "removeCoveredStops", "tStop", "segmentize", "aries-segmentation"})
/* loaded from: input_file:de/uniwue/aries/segmentation/Segmentation.class */
public final class Segmentation {

    /** The single shared instance of this class; the constructor below is private. */
    @NotNull
    public static final Segmentation INSTANCE = new Segmentation();

    /**
     * Regex alternation of the German month names.
     * The same alternation appears inline inside the pattern used by {@code findDates};
     * this constant itself is not referenced by the code visible in this file.
     */
    @NotNull
    private static final String MONTHLIST = "(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)";

    /**
     * Cache for the compiled abbreviation pattern; {@code null} until
     * {@code getAbbreviationPattern()} has built it for the first time.
     */
    @Nullable
    private static Pattern abbrevPatternCache;

    /** Private: instances are only created for {@link #INSTANCE}. */
    private Segmentation() {
    }

    /**
     * Runs the complete segmentation pipeline on {@code cas}.
     *
     * <p>Steps, in order: annotate abbreviations, dates, numbers and bracketed blocks;
     * annotate segment stops for the whole document using {@code splitter}; if
     * {@code section2splitter} is non-empty, re-create the stops inside matching sections;
     * drop stops covered by block, float or abbreviation annotations; finally build the
     * segment annotations between the remaining stops and add them to the CAS indexes.
     *
     * @param cas              the CAS whose document text is segmented
     * @param splitter         regular expression whose matches become segment stops
     * @param section2splitter per-section-type regular expressions that replace the
     *                         document-wide stops inside sections of that type; may be empty
     * @throws IOException declared by {@code findAbbreviations$aries_segmentation}
     */
    public final void segmentize(@NotNull CAS cas, @NotNull String splitter, @NotNull Map<SectionType, String> section2splitter) throws IOException {
        Intrinsics.checkNotNullParameter(cas, "cas");
        Intrinsics.checkNotNullParameter(splitter, "splitter");
        Intrinsics.checkNotNullParameter(section2splitter, "section2splitter");
        findAbbreviations$aries_segmentation(cas);
        findDates(cas);
        findNumbers(cas);
        findBlocks(cas);
        findSegmentStops(cas, splitter);
        if (!section2splitter.isEmpty()) {
            findSegmentStopsInSections(cas, section2splitter);
        }
        filterWrongStops(cas);
        // The segments are created but not yet indexed by createSegments; index them here.
        for (AnnotationFS segment : createSegments(cas)) {
            cas.addFsToIndexes(segment);
        }
    }

    /**
     * Synthetic bridge for calling {@code segmentize} with omitted arguments.
     *
     * <p>{@code i} is a bit mask: when bit {@code 2} is set the splitter falls back to
     * {@code "\\n|\\."}, when bit {@code 4} is set the section map falls back to an empty
     * map. {@code obj} is not used.
     */
    public static /* synthetic */ void segmentize$default(Segmentation segmentation, CAS cas, String str, Map map, int i, Object obj) throws IOException {
        String effectiveSplitter = str;
        Map effectiveSectionMap = map;
        if ((i & 2) != 0) {
            effectiveSplitter = "\\n|\\.";
        }
        if ((i & 4) != 0) {
            effectiveSectionMap = MapsKt.emptyMap();
        }
        segmentation.segmentize(cas, effectiveSplitter, effectiveSectionMap);
    }

    /**
     * Adds one segment-stop annotation for every match of the regular expression
     * {@code str} in the document text of {@code cas}.
     *
     * @param cas the CAS to annotate
     * @param str regular expression describing a segment stop
     */
    private final void findSegmentStops(CAS cas, String str) {
        final Type stopType = Types.getType(cas, Types.SEGMENTSTOP);
        final Pattern stopPattern = Pattern.compile(str);
        final Matcher hits = stopPattern.matcher(cas.getDocumentText());
        for (boolean found = hits.find(); found; found = hits.find()) {
            AnnotationFS stop = cas.createAnnotation(stopType, hits.start(), hits.end());
            cas.addFsToIndexes(stop);
        }
    }

    /**
     * Replaces the segment stops inside sections that have their own splitter.
     *
     * <p>For every section annotation whose {@code category} feature names a
     * {@link SectionType} present in {@code map}, the stops returned by
     * {@code UIMAUtils.getCovered} for that section are removed from the indexes and new
     * stops are created from the matches of the section-specific regular expression in
     * the section's covered text. Sections whose type is not a key of {@code map} are
     * left untouched.
     *
     * @param cas the CAS to annotate
     * @param map section type to splitter regular expression
     * @throws IllegalArgumentException if a section's category is not a {@link SectionType}
     *                                  constant name (raised by {@code SectionType.valueOf})
     */
    private final void findSegmentStopsInSections(CAS cas, Map<SectionType, String> map) {
        Type type = Types.getType(cas, Types.SEGMENTSTOP);
        Type type2 = Types.getType(cas, Types.SECTION);
        AnnotationIndex<AnnotationFS> sections = cas.getAnnotationIndex(type2);
        Intrinsics.checkNotNullExpressionValue(sections, "cas.getAnnotationIndex<AnnotationFS>(tSection)");
        for (AnnotationFS section : sections) {
            String category = section.getFeatureValueAsString(type2.getFeatureByBaseName("category"));
            Intrinsics.checkNotNullExpressionValue(category, "it.getFeatureValueAsString(tSection.getFeatureByBaseName(\"category\"))");
            SectionType sectionType = SectionType.valueOf(category);
            if (!map.containsKey(sectionType)) {
                continue;
            }
            // Drop the document-wide stops of this section before re-splitting it.
            List<AnnotationFS> coveredStops = UIMAUtils.getCovered(cas, section, type);
            Intrinsics.checkNotNullExpressionValue(coveredStops, "getCovered(cas, it, tSegmentStop)");
            for (AnnotationFS stop : coveredStops) {
                cas.removeFsFromIndexes(stop);
            }
            String sectionSplitter = map.get(sectionType);
            Intrinsics.checkNotNull(sectionSplitter);
            Matcher matcher = Pattern.compile(sectionSplitter).matcher(section.getCoveredText());
            // Match offsets are relative to the section text; shift them to document offsets.
            int sectionBegin = section.getBegin();
            while (matcher.find()) {
                cas.addFsToIndexes(cas.createAnnotation(type, sectionBegin + matcher.start(), sectionBegin + matcher.end()));
            }
        }
    }

    /** Matches text that is only whitespace (or empty) or a single punctuation mark. */
    private static final Pattern NON_CONTENT_SEGMENT = Pattern.compile("(\\s*|;|:|,|\\.|!|\\?)");

    /**
     * Builds segment annotations for the text between consecutive segment stops and for
     * the text after the last stop.
     *
     * <p>Each candidate is trimmed with {@code AnnoHelper.trimAnnotation}; candidates whose
     * covered text is only whitespace or a single punctuation mark are discarded. The
     * returned annotations are NOT added to the CAS indexes by this method.
     *
     * @param cas the CAS holding the document text and the segment stops
     * @return the retained segment annotations, in document order of their stops
     */
    private final List<AnnotationFS> createSegments(CAS cas) {
        List<AnnotationFS> segments = new ArrayList<>();
        Type tSegment = Types.getType(cas, Types.SEGMENT);
        int segmentStart = 0;
        AnnotationIndex<AnnotationFS> stops = cas.getAnnotationIndex(Types.getType(cas, Types.SEGMENTSTOP));
        for (AnnotationFS stop : stops) {
            int stopBegin = stop.getBegin();
            if (segmentStart < stopBegin) {
                collectSegment(cas, tSegment, segmentStart, stopBegin, segments);
                // NOTE(review): the cursor only advances when a candidate was created, so a
                // stop starting at or before the cursor is passed over without moving it.
                // Kept as in the original; confirm this is intended.
                segmentStart = stop.getEnd();
            }
        }
        int textLength = cas.getDocumentText().length();
        if (segmentStart < textLength) {
            collectSegment(cas, tSegment, segmentStart, textLength, segments);
        }
        return segments;
    }

    /** Creates and trims the segment [begin, end) and appends it to {@code sink} unless it has no content. */
    private static void collectSegment(CAS cas, Type tSegment, int begin, int end, List<AnnotationFS> sink) {
        AnnotationFS segment = cas.createAnnotation(tSegment, begin, end);
        Intrinsics.checkNotNullExpressionValue(segment, "segment");
        Intrinsics.checkNotNullExpressionValue(tSegment, "tSegment");
        AnnoHelper.INSTANCE.trimAnnotation(segment, tSegment);
        String coveredText = segment.getCoveredText();
        Intrinsics.checkNotNullExpressionValue(coveredText, "segment.coveredText");
        if (!NON_CONTENT_SEGMENT.matcher(coveredText).matches()) {
            sink.add(segment);
        }
    }

    /**
     * Annotates bracketed spans as blocks: the shortest run enclosed in {@code (...)},
     * {@code [...]} or {@code {...}} that the pattern can match.
     *
     * @param cas the CAS to annotate
     */
    private final void findBlocks(CAS cas) {
        final String bracketedSpan = "(\\(.*?\\)|\\[.*?\\]|\\{.*?\\})";
        final Type blockType = Types.getType(cas, Types.BLOCK);
        Intrinsics.checkNotNullExpressionValue(blockType, "tBlock");
        createBlockAnnotations(cas, bracketedSpan, blockType);
    }

    /**
     * Removes segment stops that lie inside block, float or abbreviation annotations,
     * in that order, by delegating to {@code removeCoveredStops} for each covering type.
     *
     * @param cas the CAS whose segment stops are filtered
     */
    private final void filterWrongStops(CAS cas) {
        // Resolve every type up front, in the same order as before.
        final Type stopType = Types.getType(cas, Types.SEGMENTSTOP);
        final Type floatType = Types.getType(cas, Types.FLOAT);
        final Type abbreviationType = Types.getType(cas, Types.ABBREVIATION);
        final Type blockType = Types.getType(cas, Types.BLOCK);

        // Stops inside blocks (brackets and dates).
        Intrinsics.checkNotNullExpressionValue(blockType, "tBlock");
        Intrinsics.checkNotNullExpressionValue(stopType, "tStop");
        removeCoveredStops(cas, blockType, stopType);

        // Stops inside numbers.
        Intrinsics.checkNotNullExpressionValue(floatType, "tFloat");
        removeCoveredStops(cas, floatType, stopType);

        // Stops inside abbreviations.
        Intrinsics.checkNotNullExpressionValue(abbreviationType, "tAbbrev");
        removeCoveredStops(cas, abbreviationType, stopType);
    }

    /**
     * For every annotation of {@code type}, removes from the CAS indexes all annotations
     * of {@code type2} that {@code UIMAUtils.getCovered} reports for it.
     *
     * @param cas   the CAS to modify
     * @param type  type of the covering annotations (block, float, abbreviation, ...)
     * @param type2 type of the annotations to remove (the segment stops)
     */
    private final void removeCoveredStops(CAS cas, Type type, Type type2) {
        AnnotationIndex<AnnotationFS> covering = cas.getAnnotationIndex(type);
        Intrinsics.checkNotNullExpressionValue(covering, "cas.getAnnotationIndex<AnnotationFS>(tBlock)");
        for (AnnotationFS cover : covering) {
            List<AnnotationFS> coveredStops = UIMAUtils.getCovered(cas, cover, type2);
            Intrinsics.checkNotNullExpressionValue(coveredStops, "getCovered(cas, it, tStop)");
            for (AnnotationFS stop : coveredStops) {
                cas.removeFsFromIndexes(stop);
            }
        }
    }

    /**
     * Annotates decimal numbers as floats: digits, a {@code .} or {@code ,} separator,
     * digits, and an optional trailing {@code .}.
     *
     * @param cas the CAS to annotate
     */
    private final void findNumbers(CAS cas) {
        final String decimalNumber = "\\d+(\\.|,)\\d+\\.?";
        final Type floatType = Types.getType(cas, Types.FLOAT);
        Intrinsics.checkNotNullExpressionValue(floatType, "tFloat");
        createBlockAnnotations(cas, decimalNumber, floatType);
    }

    /**
     * Annotates dates as blocks. Two notations are recognised:
     * <ul>
     *   <li>numeric: day {@code .} month {@code .} with an optional two- or four-digit
     *       year, e.g. {@code 1.2.}, {@code 01.02.20}, {@code 01.02.2020};</li>
     *   <li>written month: day {@code .}, whitespace, a German month name, whitespace and
     *       a two- or four-digit year, e.g. {@code 1. Januar 2020}.</li>
     * </ul>
     *
     * <p>The separator after the month digits is an escaped, literal dot. It used to be an
     * unescaped {@code .} (any character), which made the block swallow whatever followed
     * the month digits, e.g. a {@code ,} or {@code ;} that a custom splitter treats as a
     * segment stop.
     *
     * @param cas the CAS to annotate
     */
    private final void findDates(CAS cas) {
        Type tBlock = Types.getType(cas, Types.BLOCK);
        Intrinsics.checkNotNullExpressionValue(tBlock, "tBlock");
        createBlockAnnotations(cas, "(\\d?\\d\\.\\d?\\d\\.(\\d\\d(\\d\\d)?)?|\\d?\\d\\.\\s(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\\s\\d\\d(\\d\\d)?)", tBlock);
    }

    /**
     * Creates and indexes one annotation of {@code type} for every match of the regular
     * expression {@code str} in the document text of {@code cas}.
     *
     * @param cas  the CAS to annotate
     * @param str  regular expression to search for
     * @param type annotation type to create for each match
     */
    private final void createBlockAnnotations(CAS cas, String str, Type type) {
        final Pattern compiled = Pattern.compile(str);
        final Matcher hits = compiled.matcher(cas.getDocumentText());
        for (boolean found = hits.find(); found; found = hits.find()) {
            final AnnotationFS match = cas.createAnnotation(type, hits.start(), hits.end());
            Intrinsics.checkNotNullExpressionValue(match, "cas.createAnnotation<AnnotationFS>(tBlock, matcher.start(), matcher.end())");
            cas.addFsToIndexes(match);
        }
    }

    /**
     * Adds one abbreviation annotation for every match of the abbreviation pattern
     * (see {@code getAbbreviationPattern()}) in the document text of {@code cas}.
     *
     * @param cas the CAS to annotate
     * @throws IOException declared in the signature; no statement in this method throws it directly
     */
    public final void findAbbreviations$aries_segmentation(@NotNull CAS cas) throws IOException {
        Intrinsics.checkNotNullParameter(cas, "cas");
        final Type abbreviationType = Types.getType(cas, Types.ABBREVIATION);
        final Pattern abbreviations = getAbbreviationPattern();
        final String text = cas.getDocumentText();
        Intrinsics.checkNotNullExpressionValue(text, "cas.documentText");
        Intrinsics.checkNotNull(abbreviations);
        final Matcher hits = abbreviations.matcher(text);
        for (boolean found = hits.find(); found; found = hits.find()) {
            AnnotationFS abbreviation = cas.createAnnotation(abbreviationType, hits.start(), hits.end());
            cas.addFsToIndexes(abbreviation);
        }
    }

    /**
     * Returns the compiled abbreviation pattern, building and caching it on first use.
     *
     * <p>The pattern is built from the classpath resource {@code abbrevs_merged.txt}, read
     * as UTF-8 and split on {@code "\n"}. Lines of length 0 or 1 are ignored. The remaining
     * entries are ordered by ascending length (entries of equal length keep file order),
     * each entry loses its last character, and the results are joined into
     * {@code \b(e1|e2|...)\.}.
     *
     * <p>NOTE(review): the dropped last character is presumably the entry's trailing
     * period, since the pattern re-appends an escaped dot; confirm against the resource.
     * Entries are inserted into the pattern unquoted, so regex metacharacters inside an
     * entry (for example an inner {@code .}) are interpreted as regex syntax.
     *
     * <p>The cache field is read and written without synchronization.
     *
     * @return the cached or newly compiled pattern
     * @throws NullPointerException if the resource cannot be found on the classpath
     */
    private final Pattern getAbbreviationPattern() {
        // Read the static cache once so the check and the return see the same value.
        Pattern cached = abbrevPatternCache;
        if (cached != null) {
            return cached;
        }
        URL resource = getClass().getClassLoader().getResource("abbrevs_merged.txt");
        Intrinsics.checkNotNull(resource);
        String content = new String(TextStreamsKt.readBytes(resource), Charsets.UTF_8);
        List<String> lines = StringsKt.split$default((CharSequence) content, new String[]{"\n"}, false, 0, 6, (Object) null);

        List<String> entries = new ArrayList<>();
        for (String line : lines) {
            if (line.length() > 1) {
                entries.add(line);
            }
        }
        // List.sort is stable, so entries of equal length keep their file order.
        entries.sort(Comparator.comparingInt(String::length));

        List<String> stems = new ArrayList<>(entries.size());
        for (String entry : entries) {
            stems.add(entry.substring(0, entry.length() - 1));
        }

        Pattern compiled = Pattern.compile("\\b(" + String.join("|", stems) + ")\\.");
        abbrevPatternCache = compiled;
        return compiled;
    }

    /* renamed from: segmentize$lambda-0, reason: not valid java name */
    /**
     * Compiler-generated body of a lambda belonging to {@code segmentize}: adds
     * {@code annotationFS} to the indexes of {@code cas}.
     *
     * @param cas          the captured CAS; must not be null
     * @param annotationFS the annotation to index
     */
    private static final void m840segmentize$lambda0(CAS cas, AnnotationFS annotationFS) {
        // "$cas" is the name reported in the null-check failure message for the captured variable.
        Intrinsics.checkNotNullParameter(cas, "$cas");
        cas.addFsToIndexes(annotationFS);
    }
}
