package org.deeplearning4j.text.tokenization.tokenizer;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/deeplearning4j/text/tokenization/tokenizer/BertWordPieceTokenizer.class */
/**
 * Tokenizer that first splits the input on whitespace and around punctuation, and then breaks
 * every resulting piece into vocabulary entries by repeatedly taking the longest vocabulary key
 * that prefixes the remaining text. Every piece after the first one of a word is looked up with
 * a leading {@code "##"} marker.
 *
 * <p>All tokenization work happens eagerly in the constructor; the instance afterwards only
 * iterates over (and optionally post-processes) the stored token list.
 */
public class BertWordPieceTokenizer implements Tokenizer {
    private static final Logger log = LoggerFactory.getLogger(BertWordPieceTokenizer.class);

    /**
     * Pattern used to cut the (pre-processed) input into words: runs of whitespace are consumed
     * as separators, and zero-width splits are made before and after punctuation characters
     * (both {@code \p{Punct}} and the explicit ASCII punctuation ranges).
     */
    public static final Pattern splitPattern = Pattern.compile(
            "\\p{javaWhitespace}+"
                    + "|((?<=\\p{Punct})+|(?=\\p{Punct}+))"
                    + "|((?<=[\\x21-\\x2F])+|(?=[\\x21-\\x2F]+))"
                    + "|((?<=[\\x3A-\\x40])+|(?=[\\x3A-\\x40]+))"
                    + "|((?<=[\\x5B-\\x60])+|(?=[\\x5B-\\x60]+))"
                    + "|((?<=[\\x7B-\\x7E])+|(?=[\\x7B-\\x7E]+))",
            Pattern.UNICODE_CHARACTER_CLASS);

    /** Marker prepended to the remainder of a word once its first piece has been consumed. */
    private static final String CONTINUATION_PREFIX = "##";

    /** Word pieces produced by the constructor, before {@link #tokenPreProcess} is applied. */
    private final List<String> tokens;
    /** Applied once to the whole input string before splitting; may be {@code null}. */
    private final TokenPreProcess preTokenizePreProcessor;
    /** Applied to each token when it is handed out; may be {@code null}. */
    private TokenPreProcess tokenPreProcess;
    /** Index of the token that {@link #nextToken()} will return next. */
    private final AtomicInteger cursor = new AtomicInteger(0);

    /**
     * Tokenizes {@code str} against the given vocabulary.
     *
     * @param str              text to tokenize
     * @param navigableMap     vocabulary; must have an explicit comparator for which
     *                         {@code compare("a", "b") >= 0} (i.e. reverse sort order)
     * @param tokenPreProcess  pre-processor applied to the whole input before splitting,
     *                         or {@code null} for none
     * @param tokenPreProcess2 pre-processor applied to each produced token when it is returned,
     *                         or {@code null} for none
     * @throws IllegalArgumentException if the vocabulary has no comparator or is not reverse sorted
     * @throws IllegalStateException    if a word cannot be broken into vocabulary entries
     */
    public BertWordPieceTokenizer(String str, NavigableMap<String, Integer> navigableMap, TokenPreProcess tokenPreProcess, TokenPreProcess tokenPreProcess2) {
        // The longest-prefix search below depends on the map's ordering, so reject anything else.
        if (navigableMap.comparator() == null || navigableMap.comparator().compare("a", "b") < 0) {
            throw new IllegalArgumentException("Vocab must use reverse sort order!");
        }
        this.preTokenizePreProcessor = tokenPreProcess;
        this.tokenPreProcess = tokenPreProcess2;
        this.tokens = tokenize(navigableMap, str);
    }

    /**
     * @return {@code true} while {@link #nextToken()} has not yet consumed every token
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public boolean hasMoreTokens() {
        return this.cursor.get() < this.tokens.size();
    }

    /**
     * @return total number of tokens produced, regardless of how many were already consumed
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public int countTokens() {
        return this.tokens.size();
    }

    /**
     * Returns the token at the cursor and advances the cursor by one.
     *
     * @return the next token, passed through the token pre-processor if one is set
     * @throws IndexOutOfBoundsException if all tokens have already been consumed
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public String nextToken() {
        String token = this.tokens.get(this.cursor.getAndIncrement());
        if (this.tokenPreProcess != null) {
            token = this.tokenPreProcess.preProcess(token);
        }
        return token;
    }

    /**
     * Returns all tokens, independent of the cursor position.
     *
     * @return the internal token list itself when no token pre-processor is set; otherwise a new
     *         list holding the pre-processed tokens
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public List<String> getTokens() {
        if (this.tokenPreProcess == null) {
            return this.tokens;
        }
        List<String> processed = new ArrayList<>(this.tokens.size());
        for (String token : this.tokens) {
            processed.add(this.tokenPreProcess.preProcess(token));
        }
        return processed;
    }

    /**
     * Replaces the pre-processor applied to tokens returned by {@link #nextToken()} and
     * {@link #getTokens()}.
     *
     * @param tokenPreProcess new per-token pre-processor, or {@code null} to disable it
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.Tokenizer
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcess) {
        this.tokenPreProcess = tokenPreProcess;
    }

    /**
     * Splits the input into words and breaks each word into vocabulary pieces.
     *
     * @param navigableMap vocabulary used for the longest-prefix lookups
     * @param str          raw input text
     * @return the word pieces in input order
     * @throws IllegalStateException if a word needs more lookups than it has characters plus one,
     *                               or if a lookup finds no candidate in the vocabulary
     */
    private List<String> tokenize(NavigableMap<String, Integer> navigableMap, String str) {
        List<String> pieces = new ArrayList<>();
        String text = str;
        if (this.preTokenizePreProcessor != null) {
            text = this.preTokenizePreProcessor.preProcess(str);
        }
        for (String word : splitPattern.split(text)) {
            String remaining = word;
            int iterations = 0;
            // Stop when nothing is left, or only the bare continuation marker remains.
            while (remaining.length() > 0 && !CONTINUATION_PREFIX.equals(remaining)) {
                String piece = findLongestSubstring(navigableMap, remaining);
                pieces.add(piece);
                // piece.length() includes the "##" marker for continuation pieces, so this
                // drops the matched text and re-prefixes whatever is left.
                remaining = CONTINUATION_PREFIX + remaining.substring(piece.length());
                // Guard against lookups that make no progress: bail out once the number of
                // completed lookups exceeds the word length.
                if (iterations++ > word.length()) {
                    throw new IllegalStateException("Invalid token encountered: \"" + word + "\" likely contains characters that are not present in the vocabulary. Invalid tokens may be cleaned in a preprocessing step using a TokenPreProcessor. preTokenizePreProcessor=" + this.preTokenizePreProcessor + ", tokenPreProcess=" + this.tokenPreProcess);
                }
            }
        }
        return pieces;
    }

    /**
     * Finds a vocabulary key that is a prefix of {@code str}.
     *
     * <p>Starts from the first key at or after {@code str} in the map's ordering; while that key
     * is not a prefix of {@code str}, the search restarts from a one-character-shorter prefix of
     * {@code str}. With the reverse-sorted vocabulary required by the constructor this is
     * intended to yield the longest matching key.
     *
     * @param navigableMap vocabulary to search
     * @param str          text whose leading part should be matched
     * @return a vocabulary key that {@code str} starts with
     * @throws IllegalStateException if the search runs out of candidate keys
     */
    protected String findLongestSubstring(NavigableMap<String, Integer> navigableMap, String str) {
        NavigableMap<String, Integer> candidates = navigableMap.tailMap(str, true);
        checkIfEmpty(candidates, str);
        String candidate = candidates.firstKey();
        int prefixLength = Math.min(str.length(), candidate.length());
        while (!str.startsWith(candidate)) {
            prefixLength--;
            // Narrow the view to keys at or after the shorter prefix and retry with its first key.
            candidates = candidates.tailMap(str.substring(0, prefixLength), true);
            checkIfEmpty(candidates, str);
            candidate = candidates.firstKey();
        }
        return candidate;
    }

    /**
     * Fails with a descriptive error when no vocabulary candidates remain for {@code str}.
     *
     * @param map remaining candidate entries
     * @param str text currently being matched, used only for the error message
     * @throws IllegalStateException if {@code map} is empty
     */
    protected void checkIfEmpty(Map<String, Integer> map, String str) {
        if (map.isEmpty()) {
            throw new IllegalStateException("Invalid token/character encountered: \"" + str + "\" likely contains characters that are not present in the vocabulary. Invalid tokens may be cleaned in a preprocessing step using a TokenPreProcessor. preTokenizePreProcessor=" + this.preTokenizePreProcessor + ", tokenPreProcess=" + this.tokenPreProcess);
        }
    }
}
