/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Predicate;

/**
 * Splits text into basic tokens: the text is cleaned, CJK characters are
 * optionally isolated, the result is split on single spaces and every
 * resulting token is optionally lower-cased, optionally stripped of accents
 * and finally split on punctuation marks.
 *
 * <p>Tokens contained in the {@code neverSplit} set are emitted verbatim.
 */
public class BasicTokenizer {
    private final boolean isLowerCase;
    private final boolean isTokenizeCjkChars;
    private final boolean isStripAccents;
    private final Set<String> neverSplit;

    /**
     * Creates a tokenizer.
     *
     * @param isLowerCase        lower-case tokens using {@link Locale#ROOT}
     * @param isTokenizeCjkChars surround every CJK character with spaces so it becomes its own token
     * @param isStripAccents     remove non-spacing marks after NFD normalization
     * @param neverSplit         tokens that are emitted unchanged (not lower-cased, not accent-stripped,
     *                           not split on punctuation); {@code null} is treated as the empty set.
     *                           The set is stored by reference, not copied.
     */
    public BasicTokenizer(boolean isLowerCase, boolean isTokenizeCjkChars, boolean isStripAccents, Set<String> neverSplit) {
        this.isLowerCase = isLowerCase;
        this.isTokenizeCjkChars = isTokenizeCjkChars;
        this.isStripAccents = isStripAccents;
        // A null set would otherwise only surface later as an NPE inside tokenize().
        this.neverSplit = neverSplit == null ? Collections.<String>emptySet() : neverSplit;
    }

    /**
     * Creates a tokenizer with an empty never-split set.
     *
     * @param isLowerCase        lower-case tokens
     * @param isTokenizeCjkChars isolate CJK characters as individual tokens
     * @param isStripAccents     strip accents from tokens
     */
    public BasicTokenizer(boolean isLowerCase, boolean isTokenizeCjkChars, boolean isStripAccents) {
        this(isLowerCase, isTokenizeCjkChars, isStripAccents, Collections.<String>emptySet());
    }

    /**
     * Creates a tokenizer whose accent stripping follows the lower-casing
     * setting ({@code isStripAccents == isLowerCase}).
     *
     * @param isLowerCase        lower-case tokens and strip accents
     * @param isTokenizeCjkChars isolate CJK characters as individual tokens
     */
    public BasicTokenizer(boolean isLowerCase, boolean isTokenizeCjkChars) {
        this(isLowerCase, isTokenizeCjkChars, isLowerCase);
    }

    /** Creates a tokenizer with lower-casing, CJK tokenization and accent stripping all enabled. */
    BasicTokenizer() {
        this(true, true, true);
    }

    /**
     * Tokenizes {@code text}.
     *
     * <p>A token found in the never-split set is emitted unchanged. A token
     * that consists of a never-split entry followed only by common
     * punctuation is emitted as that entry followed by one token per
     * trailing punctuation mark. Every other token is lower-cased and
     * accent-stripped according to the configuration and then split on
     * punctuation.
     *
     * @param text the text to tokenize, must not be {@code null}
     * @return the tokens in order of appearance
     */
    public List<String> tokenize(String text) {
        String cleaned = cleanText(text);
        if (isTokenizeCjkChars) {
            cleaned = tokenizeCjkChars(cleaned);
        }

        String[] tokens = whiteSpaceTokenize(cleaned);
        List<String> processedTokens = new ArrayList<>(tokens.length);
        for (String token : tokens) {
            // Consecutive spaces produce empty strings when splitting on a single space.
            if (token.isEmpty()) {
                continue;
            }
            if (neverSplit.contains(token)) {
                processedTokens.add(token);
                continue;
            }
            if (addNeverSplitWithTrailingPunctuation(token, processedTokens)) {
                continue;
            }

            String normalized = token;
            if (isLowerCase) {
                normalized = normalized.toLowerCase(Locale.ROOT);
            }
            if (isStripAccents) {
                normalized = stripAccents(normalized);
            }
            processedTokens.addAll(splitOnPunctuation(normalized));
        }
        return processedTokens;
    }

    /**
     * Handles a token of the form {@code <never-split entry><common punctuation>+}.
     * When the token has that form the entry and the split punctuation are
     * appended to {@code out}.
     *
     * @return {@code true} if the token was handled and appended to {@code out}
     */
    private boolean addNeverSplitWithTrailingPunctuation(String token, List<String> out) {
        if (isCommonPunctuation(token.codePointAt(token.length() - 1)) == false) {
            return false;
        }
        int lastNonPunctuationIndex = findLastNonPunctuationIndex(token);
        if (lastNonPunctuationIndex < 0) {
            // The token is made of punctuation only.
            return false;
        }
        String head = token.substring(0, lastNonPunctuationIndex + 1);
        if (neverSplit.contains(head) == false) {
            return false;
        }
        out.add(head);
        out.addAll(splitOnPunctuation(token.substring(lastNonPunctuationIndex + 1)));
        return true;
    }

    /**
     * Returns the index of the last char in {@code token} that is not common
     * punctuation, or {@code -1} if every char is common punctuation.
     */
    private int findLastNonPunctuationIndex(String token) {
        int index = token.length() - 1;
        while (index >= 0 && isCommonPunctuation(token.codePointAt(index))) {
            index--;
        }
        return index;
    }

    /** @return whether tokens are lower-cased */
    public boolean isLowerCase() {
        return isLowerCase;
    }

    /** @return whether accents are stripped from tokens */
    public boolean isStripAccents() {
        return isStripAccents;
    }

    /** @return whether CJK characters are isolated as individual tokens */
    public boolean isTokenizeCjkChars() {
        return isTokenizeCjkChars;
    }

    /**
     * Trims {@code text} and splits it on single space characters. Runs of
     * several spaces therefore yield empty strings in the result, and an
     * empty input yields a single empty string.
     */
    static String[] whiteSpaceTokenize(String text) {
        return text.trim().split(" ");
    }

    /**
     * Decomposes {@code word} with NFD normalization and drops every code
     * point whose general category is {@link Character#NON_SPACING_MARK}.
     */
    static String stripAccents(String word) {
        String decomposed = Normalizer.normalize(word, Normalizer.Form.NFD);
        int[] codePoints = decomposed.codePoints()
            .filter(codePoint -> Character.getType(codePoint) != Character.NON_SPACING_MARK)
            .toArray();
        return new String(codePoints, 0, codePoints.length);
    }

    /** Splits {@code word} so that every punctuation mark becomes its own element. */
    static List<String> splitOnPunctuation(String word) {
        return splitOnPredicate(word, BasicTokenizer::isPunctuationMark);
    }

    /**
     * Splits {@code word} at every code point accepted by {@code test}. Each
     * matching code point is emitted as a single-code-point string and the
     * non-empty runs between matches are emitted as they are.
     *
     * @param word the text to split
     * @param test decides whether a code point is a split point
     * @return the pieces in order; empty if {@code word} is empty
     */
    static List<String> splitOnPredicate(String word, Predicate<Integer> test) {
        List<String> split = new ArrayList<>();
        int[] codePoints = word.codePoints().toArray();
        int runStart = 0;
        for (int i = 0; i < codePoints.length; i++) {
            if (test.test(codePoints[i])) {
                if (i > runStart) {
                    split.add(new String(codePoints, runStart, i - runStart));
                }
                split.add(new String(codePoints, i, 1));
                runStart = i + 1;
            }
        }
        if (runStart < codePoints.length) {
            split.add(new String(codePoints, runStart, codePoints.length - runStart));
        }
        return split;
    }

    /**
     * Surrounds every CJK character in {@code text} with a space on each
     * side. The input instance is returned as is when it contains no CJK
     * character.
     */
    static String tokenizeCjkChars(String text) {
        StringBuilder sb = new StringBuilder(text.length());
        boolean cjkCharFound = false;
        int offset = 0;
        while (offset < text.length()) {
            int codePoint = text.codePointAt(offset);
            offset += Character.charCount(codePoint);
            if (isCjkChar(codePoint)) {
                sb.append(' ').appendCodePoint(codePoint).append(' ');
                cjkCharFound = true;
            } else {
                sb.appendCodePoint(codePoint);
            }
        }
        return cjkCharFound ? sb.toString() : text;
    }

    /**
     * Removes NUL, the replacement character U+FFFD and control characters
     * (see {@link #isControlChar(int)}) and replaces every white space
     * character (see {@link #isWhiteSpace(int)}) with a plain space.
     */
    static String cleanText(String text) {
        int[] codePoints = text.codePoints()
            .filter(codePoint -> codePoint != 0 && codePoint != 0xFFFD && isControlChar(codePoint) == false)
            .map(codePoint -> isWhiteSpace(codePoint) ? ' ' : codePoint)
            .toArray();
        return new String(codePoints, 0, codePoints.length);
    }

    /**
     * Returns {@code true} if the code point belongs to one of the CJK
     * ideograph blocks: Unified Ideographs, Extensions A to E, Compatibility
     * Ideographs or the Compatibility Ideographs Supplement.
     */
    static boolean isCjkChar(int codePoint) {
        // block is null for code points outside any known block; calling equals on the constants is null-safe.
        Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
        return Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D.equals(block)
            || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E.equals(block)
            || Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT.equals(block);
    }

    /**
     * Returns {@code true} if the code point's general category lies between
     * {@link Character#CONTROL} and {@link Character#SURROGATE} inclusive
     * (control, format, private use, surrogate). Line feed, carriage return
     * and tab are never reported as control characters.
     */
    static boolean isControlChar(int codePoint) {
        if (codePoint == '\n' || codePoint == '\r' || codePoint == '\t') {
            return false;
        }
        int category = Character.getType(codePoint);
        return category >= Character.CONTROL && category <= Character.SURROGATE;
    }

    /**
     * Returns {@code true} for line feed, carriage return, tab and every code
     * point of general category {@link Character#SPACE_SEPARATOR}.
     */
    static boolean isWhiteSpace(int codePoint) {
        if (codePoint == '\n' || codePoint == '\r' || codePoint == '\t') {
            return true;
        }
        return Character.getType(codePoint) == Character.SPACE_SEPARATOR;
    }

    /**
     * Returns {@code true} for every non-alphanumeric printable ASCII
     * character ({@code !}-{@code /}, {@code :}-{@code @}, {@code [}-{@code `},
     * <code>{</code>-{@code ~}) and for every code point in one of the Unicode
     * punctuation categories.
     */
    static boolean isPunctuationMark(int codePoint) {
        if ((codePoint >= '!' && codePoint <= '/')
            || (codePoint >= ':' && codePoint <= '@')
            || (codePoint >= '[' && codePoint <= '`')
            || (codePoint >= '{' && codePoint <= '~')) {
            return true;
        }
        int category = Character.getType(codePoint);
        // DASH, START, END, CONNECTOR and OTHER punctuation are contiguous, as are INITIAL and FINAL quote.
        return (category >= Character.DASH_PUNCTUATION && category <= Character.OTHER_PUNCTUATION)
            || (category >= Character.INITIAL_QUOTE_PUNCTUATION && category <= Character.FINAL_QUOTE_PUNCTUATION);
    }

    /**
     * Returns {@code true} for the ASCII ranges {@code !}-{@code /} and
     * {@code :}-{@code @}. Brackets, braces and the other ASCII symbols above
     * {@code @} are deliberately not included.
     */
    static boolean isCommonPunctuation(int codePoint) {
        return (codePoint >= '!' && codePoint <= '/') || (codePoint >= ':' && codePoint <= '@');
    }
}

