/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.transform.tokenize;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.sysds.runtime.matrix.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.Tokenizer;
import org.apache.sysds.runtime.transform.tokenize.TokenizerPre;
import org.apache.sysds.runtime.transform.tokenize.TokenizerPreWhitespaceSplit;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/**
 * Pre-tokenizer that first splits the input into word tokens using a
 * {@link TokenizerPreWhitespaceSplit} and then cuts every word token into
 * character n-gram tokens.
 *
 * <p>The slicing is controlled by the {@code min_gram} and {@code max_gram}
 * entries of the JSON parameters (see {@link Params}).
 */
public class TokenizerPreNgram implements TokenizerPre {
    private static final long serialVersionUID = -6297904316677723802L;

    /** Produces the word tokens that are subsequently sliced into n-grams. */
    private final TokenizerPreWhitespaceSplit tokenizerPreWhitespaceSplit;
    /** N-gram size bounds read from the JSON specification. */
    private final Params params;

    /**
     * Creates the n-gram pre-tokenizer.
     *
     * @param idCols      passed through unchanged to the underlying whitespace splitter
     * @param tokenizeCol passed through unchanged to the underlying whitespace splitter
     * @param params      JSON parameters; forwarded to the whitespace splitter and also read
     *                    for {@code min_gram} / {@code max_gram}
     * @throws JSONException if reading a parameter from {@code params} fails
     */
    public TokenizerPreNgram(List<Integer> idCols, int tokenizeCol, JSONObject params) throws JSONException {
        this.tokenizerPreWhitespaceSplit = new TokenizerPreWhitespaceSplit(idCols, tokenizeCol, params);
        this.params = new Params(params);
    }

    /**
     * Slices a single word token into n-gram tokens.
     *
     * <p>A window of width {@code maxGram} is moved over the token text, starting at offset
     * {@code minGram - maxGram} and ending at offset
     * {@code max(tokenLength - minGram, minGram - maxGram)}. Each window is clipped to the
     * bounds of the text, so windows overlapping either edge yield shorter slices. With the
     * defaults ({@code minGram = 1}, {@code maxGram = 2}) the text {@code "abc"} yields
     * {@code "a", "ab", "bc", "c"}.
     *
     * <p>Windows that are empty-and-inverted after clipping (start beyond end) are skipped.
     * This can only happen for inconsistent parameters, e.g. {@code min_gram > max_gram}
     * combined with a short token, or a negative {@code max_gram}; without the check
     * {@code substring} would throw a {@link StringIndexOutOfBoundsException}.
     *
     * @param wordTokens the word token to slice (a single token despite the plural name)
     * @return the n-gram tokens; each start index is the word token's start index plus the
     *         offset of the slice inside the word
     */
    public List<Tokenizer.Token> wordTokenToNgrams(Tokenizer.Token wordTokens) {
        List<Tokenizer.Token> ngramTokens = new ArrayList<>();
        int tokenLen = wordTokens.textToken.length();
        // A negative start offset lets the first windows overlap the left edge, which
        // produces the shorter leading slices once the window is clipped below.
        int startPos = params.minGram - params.maxGram;
        // Never let the end fall before the start so the loop runs at least once.
        int endPos = Math.max(tokenLen - params.minGram, startPos);
        for (int i = startPos; i <= endPos; i++) {
            int startSlice = Math.max(i, 0);
            int endSlice = Math.min(i + params.maxGram, tokenLen);
            if (endSlice < startSlice) {
                // Inverted window: nothing to extract, and substring would throw.
                continue;
            }
            String substring = wordTokens.textToken.substring(startSlice, endSlice);
            long tokenStart = wordTokens.startIndex + (long) startSlice;
            ngramTokens.add(new Tokenizer.Token(substring, tokenStart));
        }
        return ngramTokens;
    }

    /**
     * Slices every word token of the list into n-grams and concatenates the results,
     * preserving the order of the input tokens.
     *
     * @param wordTokens the word tokens to slice
     * @return all n-gram tokens of all word tokens in one list
     */
    public List<Tokenizer.Token> wordTokenListToNgrams(List<Tokenizer.Token> wordTokens) {
        List<Tokenizer.Token> ngramTokens = new ArrayList<>();
        for (Tokenizer.Token wordToken : wordTokens) {
            ngramTokens.addAll(wordTokenToNgrams(wordToken));
        }
        return ngramTokens;
    }

    /**
     * Tokenizes the frame with the whitespace splitter and replaces the word tokens of
     * every document by their n-gram tokens; the document keys are carried over unchanged.
     *
     * @param in the input frame
     * @return one entry per entry returned by the whitespace splitter, holding n-gram tokens
     */
    @Override
    public List<Tokenizer.DocumentToTokens> tokenizePre(FrameBlock in) {
        List<Tokenizer.DocumentToTokens> docToWordTokens = tokenizerPreWhitespaceSplit.tokenizePre(in);
        List<Tokenizer.DocumentToTokens> docToNgramTokens = new ArrayList<>();
        for (Tokenizer.DocumentToTokens docToTokens : docToWordTokens) {
            List<Tokenizer.Token> ngramTokens = wordTokenListToNgrams(docToTokens.tokens);
            docToNgramTokens.add(new Tokenizer.DocumentToTokens(docToTokens.keys, ngramTokens));
        }
        return docToNgramTokens;
    }

    /**
     * N-gram parameters read from the JSON specification.
     */
    static class Params implements Serializable {
        private static final long serialVersionUID = -6516419749810062677L;

        /** Value of {@code min_gram}; defaults to 1 when absent. */
        public int minGram = 1;
        /** Value of {@code max_gram}; defaults to 2 when absent. */
        public int maxGram = 2;

        /**
         * Reads {@code min_gram} and {@code max_gram} from the given JSON object, keeping
         * the defaults for keys that are absent or when {@code json} is {@code null}.
         *
         * @param json the JSON parameters, may be {@code null}
         * @throws JSONException if a present value cannot be read as an int
         */
        public Params(JSONObject json) throws JSONException {
            if (json == null) {
                return;
            }
            if (json.has("min_gram")) {
                this.minGram = json.getInt("min_gram");
            }
            if (json.has("max_gram")) {
                this.maxGram = json.getInt("max_gram");
            }
        }
    }
}

