/*
 * Decompiled with CFR 0.152.
 */
package org.opensearch.neuralsearch.processor.chunker;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.opensearch.action.admin.indices.analyze.AnalyzeAction;
import org.opensearch.action.admin.indices.analyze.TransportAnalyzeAction;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.neuralsearch.processor.chunker.Chunker;
import org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser;
import org.opensearch.neuralsearch.processor.chunker.ChunkerUtil;

/**
 * Chunker that splits text into passages containing at most a fixed number of tokens.
 *
 * <p>The content is tokenized with one of the supported word tokenizers and cut at token
 * boundaries: every chunk starts at the start offset of its first token and ends right before
 * the start offset of the first token that no longer fits. Consecutive chunks may share
 * {@code floor(tokenLimit * overlapRate)} tokens.
 */
public final class FixedTokenLengthChunker
implements Chunker {
    public static final String ALGORITHM_NAME = "fixed_token_length";
    public static final String ANALYSIS_REGISTRY_FIELD = "analysis_registry";
    public static final String TOKEN_LIMIT_FIELD = "token_limit";
    public static final String OVERLAP_RATE_FIELD = "overlap_rate";
    public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count";
    public static final String TOKENIZER_FIELD = "tokenizer";
    private static final int DEFAULT_TOKEN_LIMIT = 384;
    private static final double DEFAULT_OVERLAP_RATE = 0.0;
    private static final int DEFAULT_MAX_TOKEN_COUNT = 10000;
    private static final String DEFAULT_TOKENIZER = "standard";
    private static final double OVERLAP_RATE_LOWER_BOUND = 0.0;
    private static final double OVERLAP_RATE_UPPER_BOUND = 0.5;
    private static final Set<String> WORD_TOKENIZERS = Set.of("standard", "letter", "lowercase", "whitespace", "uax_url_email", "classic", "thai");
    private int tokenLimit;
    private int maxChunkLimit;
    private String tokenizer;
    private double overlapRate;
    private final AnalysisRegistry analysisRegistry;

    /**
     * Creates a chunker from the given configuration.
     *
     * @param parameters configuration map; read for the keys described in
     *                   {@link #parseParameters(Map)} and for {@link #ANALYSIS_REGISTRY_FIELD},
     *                   whose value is cast to {@link AnalysisRegistry}
     * @throws IllegalArgumentException if {@link #parseParameters(Map)} rejects a value
     */
    public FixedTokenLengthChunker(Map<String, Object> parameters) {
        this.parseParameters(parameters);
        // NOTE(review): no null check here; a missing registry only surfaces later, inside tokenize().
        this.analysisRegistry = (AnalysisRegistry)parameters.get(ANALYSIS_REGISTRY_FIELD);
    }

    /**
     * Reads and validates the chunking configuration.
     *
     * <p>Recognized keys: {@link #TOKEN_LIMIT_FIELD} (default {@value #DEFAULT_TOKEN_LIMIT}),
     * {@link #OVERLAP_RATE_FIELD} (default {@value #DEFAULT_OVERLAP_RATE}, must lie in
     * [{@value #OVERLAP_RATE_LOWER_BOUND}, {@value #OVERLAP_RATE_UPPER_BOUND}]),
     * {@link #TOKENIZER_FIELD} (default {@value #DEFAULT_TOKENIZER}, must be one of the supported
     * word tokenizers) and {@code max_chunk_limit} (default 100).
     *
     * <p>The instance fields are only updated once every value has been parsed and validated, so
     * a rejected call leaves the previous configuration untouched.
     *
     * @param parameters configuration map
     * @throws IllegalArgumentException if the overlap rate is out of range (including NaN) or the
     *                                  tokenizer is not supported; the parser helpers presumably
     *                                  raise the same type for malformed values (verify against
     *                                  {@code ChunkerParameterParser})
     */
    @Override
    public void parseParameters(Map<String, Object> parameters) {
        int parsedTokenLimit = ChunkerParameterParser.parsePositiveIntegerParameter(parameters, TOKEN_LIMIT_FIELD, DEFAULT_TOKEN_LIMIT);
        double parsedOverlapRate = ChunkerParameterParser.parseDoubleParameter(parameters, OVERLAP_RATE_FIELD, DEFAULT_OVERLAP_RATE);
        String parsedTokenizer = ChunkerParameterParser.parseStringParameter(parameters, TOKENIZER_FIELD, DEFAULT_TOKENIZER);
        // Key and default are kept as literals: this class declares no named constant for them.
        int parsedMaxChunkLimit = ChunkerParameterParser.parseIntegerParameter(parameters, "max_chunk_limit", 100);

        // Negated in-range test on purpose: every comparison with NaN is false, so the
        // "rate < lower || rate > upper" form would silently accept a NaN overlap rate.
        boolean overlapRateInRange = parsedOverlapRate >= OVERLAP_RATE_LOWER_BOUND && parsedOverlapRate <= OVERLAP_RATE_UPPER_BOUND;
        if (!overlapRateInRange) {
            throw new IllegalArgumentException(String.format(Locale.ROOT, "Parameter [%s] must be between %s and %s", OVERLAP_RATE_FIELD, OVERLAP_RATE_LOWER_BOUND, OVERLAP_RATE_UPPER_BOUND));
        }
        if (!WORD_TOKENIZERS.contains(parsedTokenizer)) {
            throw new IllegalArgumentException(String.format(Locale.ROOT, "Tokenizer [%s] is not supported for [%s] algorithm. Supported tokenizers are %s", parsedTokenizer, ALGORITHM_NAME, WORD_TOKENIZERS));
        }

        this.tokenLimit = parsedTokenLimit;
        this.overlapRate = parsedOverlapRate;
        this.tokenizer = parsedTokenizer;
        this.maxChunkLimit = parsedMaxChunkLimit;
    }

    /**
     * Splits {@code content} into chunks of at most {@code tokenLimit} tokens each.
     *
     * <p>The first chunk always starts at position 0 and the last chunk always runs to the end
     * of {@code content}, so characters before the first token and after the last token are
     * preserved. When tokenization yields no tokens the result is empty.
     *
     * @param content           text to split
     * @param runtimeParameters per-call settings; read for {@link #MAX_TOKEN_COUNT_FIELD}
     *                          (default {@value #DEFAULT_MAX_TOKEN_COUNT}) and
     *                          {@code max_chunk_limit} (default: the configured chunk limit)
     * @return the chunks in document order
     * @throws IllegalStateException if tokenization fails
     */
    @Override
    public List<String> chunk(String content, Map<String, Object> runtimeParameters) {
        int maxTokenCount = ChunkerParameterParser.parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
        int runtimeMaxChunkLimit = ChunkerParameterParser.parseIntegerParameter(runtimeParameters, "max_chunk_limit", this.maxChunkLimit);

        List<AnalyzeAction.AnalyzeToken> tokens = this.tokenize(content, this.tokenizer, maxTokenCount);
        List<String> chunkResult = new ArrayList<>();

        int overlapTokenNumber = (int)Math.floor((double)this.tokenLimit * this.overlapRate);
        // Number of tokens the window advances per chunk.
        int stride = this.tokenLimit - overlapTokenNumber;

        int startTokenIndex = 0;
        while (startTokenIndex < tokens.size()) {
            // Presumably aborts once the number of produced chunks hits the limit — see ChunkerUtil.
            ChunkerUtil.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, this.maxChunkLimit);

            // The first chunk also keeps any characters that precede the first token.
            int startContentPosition = startTokenIndex == 0 ? 0 : tokens.get(startTokenIndex).getStartOffset();

            int nextWindowTokenIndex = startTokenIndex + this.tokenLimit;
            if (nextWindowTokenIndex >= tokens.size()) {
                // Last chunk: keep everything up to the end, including text after the last token.
                chunkResult.add(content.substring(startContentPosition, content.length()));
                break;
            }

            // Cut right before the first token that does not fit into this chunk.
            int endContentPosition = tokens.get(nextWindowTokenIndex).getStartOffset();
            chunkResult.add(content.substring(startContentPosition, endContentPosition));
            startTokenIndex += stride;
        }
        return chunkResult;
    }

    /**
     * Tokenizes {@code content} through {@link TransportAnalyzeAction#analyze} using the named
     * tokenizer and this chunker's analysis registry.
     *
     * @param content       text to tokenize
     * @param tokenizer     name of the tokenizer to apply
     * @param maxTokenCount token limit forwarded to the analyze call
     * @return the tokens reported by the analyze response
     * @throws IllegalStateException wrapping any exception raised by the analyze call
     */
    private List<AnalyzeAction.AnalyzeToken> tokenize(String content, String tokenizer, int maxTokenCount) {
        AnalyzeAction.Request analyzeRequest = new AnalyzeAction.Request();
        analyzeRequest.text(new String[]{content});
        analyzeRequest.tokenizer(tokenizer);
        try {
            AnalyzeAction.Response analyzeResponse = TransportAnalyzeAction.analyze(analyzeRequest, this.analysisRegistry, null, maxTokenCount);
            return analyzeResponse.getTokens();
        }
        catch (Exception e) {
            throw new IllegalStateException(String.format(Locale.ROOT, "analyzer %s throws exception: %s", tokenizer, e.getMessage()), e);
        }
    }
}

