package de.uniwue.dw.segmentation;

import de.uniwue.aries.uima.types.Types;
import de.uniwue.aries.uima.util.UIMAUtils;
import de.uniwue.dw.segmentation.abbreviations.Abbreviations;
import de.uniwue.dw.segmentation.tagging.POSTagging;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;

/* loaded from: input_file:de/uniwue/dw/segmentation/SegmentationPhaseOne.class */
/**
 * First phase of letter segmentation.
 *
 * <p>The phase marks abbreviations, dates, numbers, bracketed blocks and simple enumerations in the
 * document text, places segment stops inside every section, removes the stops that fall inside one
 * of the marked spans, derives a section body for every section and finally cuts each body into
 * segments at the remaining stops.
 */
public class SegmentationPhaseOne {
    /** Base name of the feature that carries the category on sections, section bodies and segments. */
    private static final String CATEGORY_FEATURE = "category";

    /** Text enclosed in round, square or curly brackets (non-greedy, single line). */
    private static final Pattern BLOCK_PATTERN = Pattern.compile("(\\(.*?\\)|\\[.*?\\]|\\{.*?\\})");

    /** Decimal numbers with '.' or ',' as separator, including one optional trailing dot. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d+(\\.|,)\\d+\\.?");

    /**
     * Numeric dates ({@code 1.2.}, {@code 01.02.15}, {@code 01.02.2015}) and dates with a German
     * month name ({@code 1. Januar 2015}).
     *
     * <p>The dot after the month is escaped. The unescaped form accepted any character there, so a
     * decimal followed by a separator (for example {@code "4.5,"}) was marked as a date and the
     * separator was subsequently discarded as a segment stop.
     */
    private static final Pattern DATE_PATTERN = Pattern.compile("(\\d?\\d\\.\\d?\\d\\.(\\d\\d(\\d\\d)?)?|\\d?\\d\\.\\s(Januar|Februar|März|April|Mai|Juni|Juli|August|September|Oktober|November|Dezember)\\s\\d\\d(\\d\\d)?)");

    /** A lower-case letter followed by "-," as in a hyphenated enumeration. */
    private static final Pattern SIMPLE_LIST_PATTERN = Pattern.compile("[a-zäöüß]-,");

    /** Comma separated words that end with "und"/"oder"/"u."/"o."/"od." and one further word. */
    private static final Pattern HEARST_PATTERN = Pattern.compile("[A-ZÄÖÜa-zäöüß](,[A-ZÄÖÜa-zäöüß\\s]+)+?(und|oder|u\\.|od?\\.) [A-ZÄÖÜa-zäöüß]+");

    /** Text that is empty, whitespace only, or a single punctuation character. */
    private static final Pattern PUNCTUATION_ONLY = Pattern.compile("(\\s*|;|:|,|\\.|!|\\?)");

    /** Text starting with an opening tag whose name is a single character, e.g. {@code <b>}. */
    private static final Pattern OPENING_TAG_PREFIX = Pattern.compile("<.>.*");

    /** Text containing an opening single-character tag anywhere. */
    private static final Pattern CONTAINS_OPENING_TAG = Pattern.compile(".*?<.>.*");

    /** Text starting with a closing single-character tag, e.g. {@code </b>}. */
    private static final Pattern CLOSING_TAG_PREFIX = Pattern.compile("</.>.*");

    /** Text ending with a closing single-character tag. */
    private static final Pattern CLOSING_TAG_SUFFIX = Pattern.compile(".*</.>");

    /** Text containing a closing single-character tag anywhere. */
    private static final Pattern CONTAINS_CLOSING_TAG = Pattern.compile(".*?</.>.*");

    /** Pattern without any match; used when no abbreviation is available. */
    private static final Pattern NEVER_MATCHES = Pattern.compile("(?!)");

    /** Lazily loaded lines of the abbreviation resource. */
    private static List<String> abbrevs = null;

    /** Lazily built pattern matching any known abbreviation followed by a dot. */
    private static Pattern abbreviationsPattern = null;

    /**
     * Segments a letter without custom splitters and with the verb-aware handling enabled.
     *
     * @param letter the letter to segment
     * @param z      forwarded unchanged to
     *               {@link #segmentizeLetter(Letter, String, boolean, String, boolean)}
     * @throws CASRuntimeException if a CAS operation fails
     * @throws IOException         if the abbreviation list cannot be read
     */
    public static void segmentizeLetter(Letter letter, boolean z) throws CASRuntimeException, IOException {
        segmentizeLetter(letter, null, z, null, true);
    }

    /**
     * Segments a letter; does nothing when the letter is already flagged as segmented.
     *
     * @param letter the letter to segment; it is flagged as segmented afterwards
     * @param str    optional splitter regex; only used for sections of category
     *               {@code SectionType.DEFAULT} when the verb-aware fallback does not apply
     * @param z      when {@code true} sections are read from {@code Types.SECTIONG} instead of
     *               {@code Types.SECTION} and segments are created as {@code Types.SEGMENTG}
     *               instead of {@code Types.SEGMENT}
     * @param str2   optional splitter regex replacing {@code Constants.FALLBACKSPLITTER} in the
     *               verb-aware fallback
     * @param z2     when {@code true} sections containing a verb (except category LABOR) are split
     *               with the fallback splitter and {@link SegmentationPhaseTwo} runs afterwards
     * @throws CASRuntimeException if a CAS operation fails
     * @throws IOException         if the abbreviation list cannot be read
     */
    public static void segmentizeLetter(Letter letter, String str, boolean z, String str2, boolean z2) throws CASRuntimeException, IOException {
        if (letter.isSegmented()) {
            return;
        }
        CAS cas = letter.getCas();
        Type sectionAnnotationType = z ? Types.getType(cas, Types.SECTIONG) : Types.getType(cas, Types.SECTION);

        findAbbreviations(cas);
        findDates(cas);
        findNumbers(cas);
        findBlocks(cas);
        findSimpleLists(cas);
        findSimpleHearstPatterns(cas);

        // Work on snapshots: the loop bodies add annotations to and remove annotations from the
        // indexes, which must not happen underneath a live index iterator.
        for (AnnotationFS section : snapshot(cas, sectionAnnotationType)) {
            SectionType category = SectionType.getTypeForString(
                    section.getFeatureValueAsString(sectionAnnotationType.getFeatureByBaseName(CATEGORY_FEATURE)));
            findSegmentStops(cas, section, category, str, str2, z2);
            filterWrongStops(cas);
            createSectionBodies(cas, section, category);
        }
        for (AnnotationFS body : snapshot(cas, Types.getType(cas, Types.SECTIONBODY))) {
            createSegments(cas, body, z).forEach(cas::addFsToIndexes);
        }
        if (z2) {
            SegmentationPhaseTwo.segmentizeSegmentsWithoutVerbs(cas, z, str);
        }
        letter.setSegmented(true);
    }

    /**
     * Marks every bracketed span of the document text as {@code Types.BLOCK}.
     *
     * @param cas the CAS to annotate
     */
    public static void findBlocks(CAS cas) {
        annotateMatches(cas, Types.getType(cas, Types.BLOCK), BLOCK_PATTERN);
    }

    /**
     * Cuts a section body into segments at the segment stops that touch it.
     *
     * <p>The returned segments carry the category of the body, are trimmed with
     * {@link #trimSegment(AnnotationFS, Type)} and are <em>not</em> added to the indexes. Segments
     * that are empty or consist of whitespace or a single punctuation character are dropped.
     *
     * @param cas          the CAS holding the annotations
     * @param annotationFS the section body to split
     * @param z            when {@code true} segments are created as {@code Types.SEGMENTG},
     *                     otherwise as {@code Types.SEGMENT}
     * @return the created segments in document order
     */
    public static List<AnnotationFS> createSegments(CAS cas, AnnotationFS annotationFS, boolean z) {
        List<AnnotationFS> segments = new LinkedList<>();
        Type segmentType = z ? Types.getType(cas, Types.SEGMENTG) : Types.getType(cas, Types.SEGMENT);
        Type stopType = Types.getType(cas, Types.SEGMENTSTOP);
        SectionType category = SectionType.getTypeForString(
                annotationFS.getFeatureValueAsString(Types.getType(cas, Types.SECTIONBODY).getFeatureByBaseName(CATEGORY_FEATURE)));
        if (category == null) {
            // Same fallback as createSectionBodies; avoids a NullPointerException on getName().
            category = SectionType.UNKNOWN;
        }

        int bodyBegin = annotationFS.getBegin();
        int bodyEnd = annotationFS.getEnd();
        int segmentBegin = bodyBegin;
        for (AnnotationFS stop : snapshot(cas, stopType)) {
            boolean touchesBody = stop.getEnd() >= bodyBegin && stop.getBegin() <= bodyEnd;
            if (!touchesBody) {
                continue;
            }
            if (segmentBegin < stop.getBegin()) {
                addSegmentIfMeaningful(cas, segments, segmentType, category, segmentBegin, stop.getBegin());
            }
            // Always step over the stop, also when it starts right at the cursor (two adjacent
            // stops); otherwise the delimiter itself would become part of the next segment.
            segmentBegin = Math.max(segmentBegin, stop.getEnd());
        }
        if (segmentBegin < bodyEnd) {
            addSegmentIfMeaningful(cas, segments, segmentType, category, segmentBegin, bodyEnd);
        }
        return segments;
    }

    /** Creates, categorises and trims one segment and keeps it unless it is blank or punctuation only. */
    private static void addSegmentIfMeaningful(CAS cas, List<AnnotationFS> segments, Type segmentType, SectionType category, int begin, int end) {
        AnnotationFS segment = cas.createAnnotation(segmentType, begin, end);
        segment.setFeatureValueFromString(segmentType.getFeatureByBaseName(CATEGORY_FEATURE), category.getName());
        trimSegment(segment, segmentType);
        if (!PUNCTUATION_ONLY.matcher(segment.getCoveredText()).matches()) {
            segments.add(segment);
        }
    }

    /**
     * Shrinks an annotation until none of the following applies any more, always applying the
     * first rule that matches and then starting over:
     * <ol>
     *   <li>a leading newline or space is cut off;</li>
     *   <li>a trailing newline or space is cut off;</li>
     *   <li>a leading opening tag (3 characters) is cut off if the text contains no closing tag;</li>
     *   <li>a trailing closing tag (4 characters) is cut off if the text contains no opening tag;</li>
     *   <li>a leading closing tag (4 characters) is cut off.</li>
     * </ol>
     * Tags are recognised only when their name is a single character.
     *
     * @param annotationFS the annotation whose begin and end offsets are adjusted in place
     * @param type         the type used to look up the begin and end features
     */
    public static void trimSegment(AnnotationFS annotationFS, Type type) {
        while (true) {
            String text = annotationFS.getCoveredText();
            if (text.startsWith("\n") || text.startsWith(" ")) {
                moveBegin(annotationFS, type, 1);
            } else if (text.endsWith("\n") || text.endsWith(" ")) {
                moveEnd(annotationFS, type, -1);
            } else if (OPENING_TAG_PREFIX.matcher(text).matches() && !CONTAINS_CLOSING_TAG.matcher(text).matches()) {
                moveBegin(annotationFS, type, 3);
            } else if (CLOSING_TAG_SUFFIX.matcher(text).matches() && !CONTAINS_OPENING_TAG.matcher(text).matches()) {
                moveEnd(annotationFS, type, -4);
            } else if (CLOSING_TAG_PREFIX.matcher(text).matches()) {
                moveBegin(annotationFS, type, 4);
            } else {
                return;
            }
        }
    }

    /** Shifts the begin offset of an annotation by {@code delta} characters. */
    private static void moveBegin(AnnotationFS annotation, Type type, int delta) {
        annotation.setIntValue(type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_BEGIN), annotation.getBegin() + delta);
    }

    /** Shifts the end offset of an annotation by {@code delta} characters. */
    private static void moveEnd(AnnotationFS annotation, Type type, int delta) {
        annotation.setIntValue(type.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END), annotation.getEnd() + delta);
    }

    /**
     * Creates and indexes the section body of a section.
     *
     * <p>The body starts one character after the first headline covered by the section, provided
     * that headline ends before the section does; otherwise it spans the whole section. Leading
     * newlines and spaces are skipped. A {@code null} category is stored as
     * {@code SectionType.UNKNOWN}.
     */
    private static void createSectionBodies(CAS cas, AnnotationFS section, SectionType category) {
        Type bodyType = Types.getType(cas, Types.SECTIONBODY);
        Type headlineType = Types.getType(cas, Types.HEADLINE);
        SectionType effectiveCategory = category == null ? SectionType.UNKNOWN : category;

        List<AnnotationFS> headlines = UIMAUtils.getCovered(cas, section, headlineType);
        int bodyBegin = section.getBegin();
        if (!headlines.isEmpty() && headlines.get(0).getEnd() < section.getEnd()) {
            bodyBegin = headlines.get(0).getEnd() + 1;
        }
        AnnotationFS body = cas.createAnnotation(bodyType, bodyBegin, section.getEnd());

        String bodyText = body.getCoveredText();
        while (bodyText.startsWith("\n") || bodyText.startsWith(" ")) {
            moveBegin(body, bodyType, 1);
            bodyText = body.getCoveredText();
        }
        body.setFeatureValueFromString(bodyType.getFeatureByBaseName(CATEGORY_FEATURE), effectiveCategory.getName());
        cas.addFsToIndexes(body);
    }

    /**
     * Removes from the indexes every segment stop that is covered by a block, a float or an
     * abbreviation annotation.
     *
     * @param cas the CAS to clean up
     */
    public static void filterWrongStops(CAS cas) {
        Type stopType = Types.getType(cas, Types.SEGMENTSTOP);
        Type floatType = Types.getType(cas, Types.FLOAT);
        Type abbreviationType = Types.getType(cas, Types.ABBREVIATION);
        Type blockType = Types.getType(cas, Types.BLOCK);

        removeCoveredStops(cas, blockType, stopType);
        removeCoveredStops(cas, floatType, stopType);
        removeCoveredStops(cas, abbreviationType, stopType);
    }

    /** Removes all {@code stopType} annotations covered by any annotation of {@code coveringType}. */
    private static void removeCoveredStops(CAS cas, Type coveringType, Type stopType) {
        for (AnnotationFS covering : snapshot(cas, coveringType)) {
            UIMAUtils.getCovered(cas, covering, stopType).forEach(cas::removeFsFromIndexes);
        }
    }

    /**
     * Adds a segment stop for every splitter match inside the given section.
     *
     * <p>Splitter choice: with {@code verbAware} set, a section that contains a verb and is not of
     * category LABOR uses {@code customFallbackSplitter}, or {@code Constants.FALLBACKSPLITTER}
     * when that is {@code null}. Otherwise a non-empty {@code customSplitter} is used for category
     * DEFAULT and {@link #getSplitterForCategory(SectionType)} for everything else.
     */
    private static void findSegmentStops(CAS cas, AnnotationFS section, SectionType category, String customSplitter, String customFallbackSplitter, boolean verbAware) {
        Type stopType = Types.getType(cas, Types.SEGMENTSTOP);
        String sectionText = section.getCoveredText();

        String splitter;
        if (verbAware && POSTagging.containsVerb(sectionText) && category != SectionType.LABOR) {
            splitter = customFallbackSplitter != null ? customFallbackSplitter : Constants.FALLBACKSPLITTER;
        } else if (customSplitter != null && !customSplitter.isEmpty() && category == SectionType.DEFAULT) {
            splitter = customSplitter;
        } else {
            splitter = getSplitterForCategory(category);
        }

        // Matches are relative to the section text, annotations need document offsets.
        int offset = section.getBegin();
        Matcher matcher = Pattern.compile(splitter).matcher(sectionText);
        while (matcher.find()) {
            cas.addFsToIndexes(cas.createAnnotation(stopType, offset + matcher.start(), offset + matcher.end()));
        }
    }

    /**
     * Marks decimal numbers in the document text as {@code Types.FLOAT}.
     *
     * @param cas the CAS to annotate
     */
    public static void findNumbers(CAS cas) {
        annotateMatches(cas, Types.getType(cas, Types.FLOAT), NUMBER_PATTERN);
    }

    /**
     * Marks dates in the document text; they are annotated with the {@code Types.BLOCK} type.
     *
     * @param cas the CAS to annotate
     */
    public static void findDates(CAS cas) {
        annotateMatches(cas, Types.getType(cas, Types.BLOCK), DATE_PATTERN);
    }

    /**
     * Marks every lower-case letter followed by "-," as {@code Types.BLOCK}.
     *
     * @param cas the CAS to annotate
     */
    protected static void findSimpleLists(CAS cas) {
        annotateMatches(cas, Types.getType(cas, Types.BLOCK), SIMPLE_LIST_PATTERN);
    }

    /**
     * Marks comma separated enumerations that end with "und", "oder", "u.", "o." or "od." plus one
     * further word as {@code Types.BLOCK}.
     *
     * @param cas the CAS to annotate
     */
    protected static void findSimpleHearstPatterns(CAS cas) {
        annotateMatches(cas, Types.getType(cas, Types.BLOCK), HEARST_PATTERN);
    }

    /**
     * Marks every known abbreviation, including its trailing dot, as {@code Types.ABBREVIATION}.
     *
     * @param cas the CAS to annotate
     * @throws IOException if the abbreviation list cannot be read
     */
    public static void findAbbreviations(CAS cas) throws IOException {
        annotateMatches(cas, Types.getType(cas, Types.ABBREVIATION), getAbbreviationPattern());
    }

    /** Creates and indexes one annotation of {@code type} per match of {@code pattern} in the document text. */
    private static void annotateMatches(CAS cas, Type type, Pattern pattern) {
        Matcher matcher = pattern.matcher(cas.getDocumentText());
        while (matcher.find()) {
            cas.addFsToIndexes(cas.createAnnotation(type, matcher.start(), matcher.end()));
        }
    }

    /** Copies the annotations of {@code type} into a list, in index order. */
    private static List<AnnotationFS> snapshot(CAS cas, Type type) {
        List<AnnotationFS> annotations = new LinkedList<>();
        Iterator<?> iterator = cas.getAnnotationIndex(type).iterator();
        while (iterator.hasNext()) {
            annotations.add((AnnotationFS) iterator.next());
        }
        return annotations;
    }

    /**
     * Returns the cached abbreviation pattern, building it on first use.
     *
     * <p>Each entry of the list has its last character removed (presumably the trailing dot of the
     * abbreviation - TODO confirm against the resource) because the pattern appends an escaped dot
     * itself. Entries are quoted so that dots inside an abbreviation match literally, and longer
     * entries come first so that they win over their own prefixes.
     */
    private static Pattern getAbbreviationPattern() throws IOException {
        if (abbreviationsPattern == null) {
            if (abbrevs == null) {
                abbrevs = loadAbbreviations();
            }
            String alternatives = abbrevs.stream()
                    // Blank or one-character lines would leave an empty alternative (or fail in substring).
                    .filter(entry -> entry.length() > 1)
                    .sorted((left, right) -> Integer.compare(right.length(), left.length()))
                    .map(entry -> Pattern.quote(entry.substring(0, entry.length() - 1)))
                    .collect(Collectors.joining("|"));
            abbreviationsPattern = alternatives.isEmpty()
                    ? NEVER_MATCHES
                    : Pattern.compile("\\b(" + alternatives + ")\\.");
        }
        return abbreviationsPattern;
    }

    /** Reads the lines of {@code abbrevs_merged.txt} next to {@link Abbreviations} and closes the stream. */
    private static List<String> loadAbbreviations() throws IOException {
        try (java.io.InputStream in = Abbreviations.class.getResourceAsStream("abbrevs_merged.txt")) {
            if (in == null) {
                throw new IOException("Abbreviation resource abbrevs_merged.txt not found next to " + Abbreviations.class.getName());
            }
            // NOTE(review): this overload decodes with the platform default charset - confirm the
            // encoding of the resource and pass it explicitly.
            return IOUtils.readLines(in);
        }
    }

    /**
     * Returns the regex whose matches are used as segment stops for a section category.
     *
     * @param sectionType the category, may be {@code null}
     * @return the splitter regex; {@code "(\\.|\\n)"} for {@code null}
     */
    public static String getSplitterForCategory(SectionType sectionType) {
        if (sectionType == null) {
            return "(\\.|\\n)";
        }
        switch (sectionType) {
            case ECHO:
            case LABOR:
            case HERZKATHETER:
                return "(\\.|,|;|\\n)";
            case DIAGNOSE:
                return "\\n";
            case SONOGRAFIE:
                return "(\\.|;|\\n)";
            case MEDIKATION:
                return "(\\n)";
            case ANAMNESE:
            case KU:
                return "(\\.|,|\\n)";
            case ROENTGEN_THORAX:
                return "(\\.|,|;)";
            default:
                return "(\\.|\\n|,)";
        }
    }
}
