package org.deeplearning4j.bagofwords.vectorizer;

import java.util.ArrayList;
import java.util.Collection;
import org.deeplearning4j.models.sequencevectors.iterators.AbstractSequenceIterator;
import org.deeplearning4j.models.sequencevectors.transformers.impl.SentenceTransformer;
import org.deeplearning4j.models.word2vec.VocabWord;
import org.deeplearning4j.models.word2vec.wordstore.VocabCache;
import org.deeplearning4j.models.word2vec.wordstore.VocabConstructor;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache;
import org.deeplearning4j.text.documentiterator.LabelAwareIterator;
import org.deeplearning4j.text.documentiterator.LabelsSource;
import org.deeplearning4j.text.invertedindex.InvertedIndex;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

/* loaded from: input_file:org/deeplearning4j/bagofwords/vectorizer/BaseTextVectorizer.class */
/**
 * Base class for {@link TextVectorizer} implementations.
 *
 * <p>It holds the shared configuration (tokenizer factory, document iterator,
 * minimum word frequency, stop words) and provides vocabulary construction
 * through {@link #buildVocab()}, which {@link #fit()} delegates to.
 */
public abstract class BaseTextVectorizer implements TextVectorizer {
    /** Tokenizer factory handed to the {@link SentenceTransformer} in {@link #buildVocab()}. */
    protected transient TokenizerFactory tokenizerFactory;

    /** Document iterator handed to the {@link SentenceTransformer} in {@link #buildVocab()}. */
    protected transient LabelAwareIterator iterator;

    /** Frequency threshold passed to the vocabulary constructor together with the source. */
    protected int minWordFrequency;

    /** Target vocabulary; created in {@link #buildVocab()} if still {@code null} at that point. */
    protected VocabCache<VocabWord> vocabCache;

    /** Labels source exposed through {@link #getLabelsSource()}; not assigned by this class. */
    protected LabelsSource labelsSource;

    /** Inverted index exposed through {@link #getIndex()}; not assigned by this class. */
    protected transient InvertedIndex<VocabWord> index;

    /** Stop words passed to the vocabulary constructor; empty by default. */
    protected Collection<String> stopWords = new ArrayList<String>();

    /** Whether parallel tokenization is allowed during vocabulary construction; {@code true} by default. */
    protected boolean isParallel = true;

    /**
     * Returns the labels source held by this vectorizer.
     *
     * @return the current labels source, possibly {@code null} if none was set
     */
    public LabelsSource getLabelsSource() {
        return this.labelsSource;
    }

    /**
     * Builds the vocabulary from the configured iterator and tokenizer factory.
     *
     * <p>If no vocabulary cache has been set, a new {@link AbstractCache} is created
     * and used as the target. The configured stop words, minimum word frequency and
     * parallel-tokenization flag are forwarded to the {@link VocabConstructor}.
     */
    public void buildVocab() {
        if (this.vocabCache == null) {
            this.vocabCache = new AbstractCache.Builder<VocabWord>().build();
        }

        // Step 1: turn the raw documents into tokenized sequences.
        SentenceTransformer transformer = new SentenceTransformer.Builder()
                .iterator(this.iterator)
                .tokenizerFactory(this.tokenizerFactory)
                .build();

        // Step 2: expose the transformer output as a sequence iterator.
        AbstractSequenceIterator<VocabWord> sequenceIterator =
                new AbstractSequenceIterator.Builder<VocabWord>(transformer).build();

        // Step 3: configure the vocabulary constructor against the target cache.
        VocabConstructor<VocabWord> constructor = new VocabConstructor.Builder<VocabWord>()
                .addSource(sequenceIterator, this.minWordFrequency)
                .setTargetVocabCache(this.vocabCache)
                .setStopWords(this.stopWords)
                .allowParallelTokenization(this.isParallel)
                .build();

        // The returned cache is ignored here; this.vocabCache was supplied as the target above.
        constructor.buildJointVocabulary(false, true);
    }

    /**
     * Fits this vectorizer; in this base class that consists solely of {@link #buildVocab()}.
     */
    @Override
    public void fit() {
        buildVocab();
    }

    /**
     * Returns the total number of word occurrences recorded by the vocabulary cache.
     *
     * @return the value of {@code vocabCache.totalWordOccurrences()}
     * @throws NullPointerException if the vocabulary cache has not been set or built yet
     */
    @Override
    public long numWordsEncountered() {
        return this.vocabCache.totalWordOccurrences();
    }

    /**
     * Replaces the tokenizer factory used by subsequent {@link #buildVocab()} calls.
     *
     * @param tokenizerFactory the tokenizer factory to use
     */
    public void setTokenizerFactory(TokenizerFactory tokenizerFactory) {
        this.tokenizerFactory = tokenizerFactory;
    }

    /**
     * Returns the vocabulary cache held by this vectorizer.
     *
     * @return the vocabulary cache, or {@code null} if none was set and
     *         {@link #buildVocab()} has not run
     */
    @Override
    public VocabCache<VocabWord> getVocabCache() {
        return this.vocabCache;
    }

    /**
     * Returns the inverted index held by this vectorizer.
     *
     * @return the inverted index, possibly {@code null}
     */
    @Override
    public InvertedIndex<VocabWord> getIndex() {
        return this.index;
    }

    /**
     * Reports whether parallel tokenization is allowed during vocabulary construction.
     *
     * @return the current value of the parallel flag
     */
    public boolean isParallel() {
        return this.isParallel;
    }
}
