/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.compress.lib;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
import org.apache.sysds.runtime.compress.CompressedMatrixBlockFactory;
import org.apache.sysds.runtime.compress.colgroup.AColGroup;
import org.apache.sysds.runtime.compress.colgroup.ADictBasedColGroup;
import org.apache.sysds.runtime.compress.colgroup.ASDCZero;
import org.apache.sysds.runtime.compress.colgroup.ColGroupConst;
import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC;
import org.apache.sysds.runtime.compress.colgroup.ColGroupEmpty;
import org.apache.sysds.runtime.compress.colgroup.IMapToDataGroup;
import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary;
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
import org.apache.sysds.runtime.compress.lib.CLALibScalar;
import org.apache.sysds.runtime.compress.lib.CLALibUtils;
import org.apache.sysds.runtime.data.DenseBlock;
import org.apache.sysds.runtime.data.DenseBlockFP64;
import org.apache.sysds.runtime.data.SparseBlock;
import org.apache.sysds.runtime.data.SparseBlockMCSR;
import org.apache.sysds.runtime.data.SparseRow;
import org.apache.sysds.runtime.data.SparseRowScalar;
import org.apache.sysds.runtime.data.SparseRowVector;
import org.apache.sysds.runtime.frame.data.columns.HashMapToInt;
import org.apache.sysds.runtime.functionobjects.Divide;
import org.apache.sysds.runtime.functionobjects.Minus;
import org.apache.sysds.runtime.functionobjects.Multiply;
import org.apache.sysds.runtime.functionobjects.Plus;
import org.apache.sysds.runtime.functionobjects.Power;
import org.apache.sysds.runtime.functionobjects.ValueComparisonFunction;
import org.apache.sysds.runtime.functionobjects.ValueFunction;
import org.apache.sysds.runtime.matrix.data.LibMatrixBincell;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.runtime.matrix.data.Pair;
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysds.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.utils.DMLCompressionStatistics;
import org.apache.sysds.utils.stats.Timing;

public final class CLALibBinaryCellOp {
    // Class-wide logger, registered under this class's fully qualified name.
    private static final Log LOG = LogFactory.getLog((String)CLALibBinaryCellOp.class.getName());
    // NOTE(review): presumably the target number of cells of a temporary decompression block;
    // allocateTempUncompressedBlock uses the same value (16384) inline -- confirm against callers.
    public static final int DECOMPRESSION_BLEN = 16384;

    // Private no-op constructor: the members of this class visible here are all static,
    // so instantiation from outside is blocked.
    private CLALibBinaryCellOp() {
    }

    /**
     * Entry point for a binary cell operation in which the compressed block {@code m1} is passed first
     * and {@code that} second.
     *
     * <p>A 1x1 or empty {@code that} is collapsed to a {@link RightScalarOperator} built from
     * {@code that.get(0, 0)}; everything else is forwarded to {@code binaryOperationsRightFiltered}.
     *
     * @param op   the binary operator (may be replaced by a sparse-safe variant)
     * @param m1   the compressed operand
     * @param that the other operand
     * @return the result block
     * @throws DMLRuntimeException wrapping any exception raised while executing the operation
     */
    public static MatrixBlock binaryOperationsRight(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that) {
        try {
            op = LibMatrixBincell.replaceOpWithSparseSafeIfApplicable(m1, that, op);
            final boolean isSingleCell = that.getNumRows() == 1 && that.getNumColumns() == 1;
            if (!isSingleCell && !that.isEmpty()) {
                return CLALibBinaryCellOp.binaryOperationsRightFiltered(op, m1, that);
            }
            // Scalar shortcut: the operand is represented by its (0,0) cell.
            final RightScalarOperator scalarOp = new RightScalarOperator(op.fn, that.get(0, 0), op.getNumThreads());
            return CLALibScalar.scalarOperations(scalarOp, m1, null);
        }
        catch (Exception ex) {
            throw new DMLRuntimeException("Failed Right Binary Compressed Operation", ex);
        }
    }

    /**
     * Entry point for a binary cell operation processed with the {@code left} flag set, i.e. the
     * non-compressed operand {@code that} is handed to the operator on the left side.
     *
     * <p>A 1x1 or empty {@code that} is collapsed to a {@link LeftScalarOperator} built from
     * {@code that.get(0, 0)}. Otherwise {@code that} is uncompressed and the work is dispatched on the
     * access type computed for {@code (that, m1)}.
     *
     * @param op   the binary operator (may be replaced by a sparse-safe variant)
     * @param m1   the compressed operand
     * @param that the other operand
     * @return the result block
     * @throws DMLRuntimeException wrapping any exception raised while executing the operation
     */
    public static MatrixBlock binaryOperationsLeft(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that) {
        try {
            op = LibMatrixBincell.replaceOpWithSparseSafeIfApplicable(m1, that, op);
            final boolean isSingleCell = that.getNumRows() == 1 && that.getNumColumns() == 1;
            if (isSingleCell || that.isEmpty()) {
                final LeftScalarOperator scalarOp = new LeftScalarOperator(op.fn, that.get(0, 0), op.getNumThreads());
                return CLALibScalar.scalarOperations(scalarOp, m1, null);
            }
            final MatrixBlock plainThat = CompressedMatrixBlock.getUncompressed(that, "Decompressing left side in BinaryOps");
            final LibMatrixBincell.BinaryAccessType accessType = LibMatrixBincell.getBinaryAccessTypeExtended(plainThat, m1);
            return CLALibBinaryCellOp.selectProcessingBasedOnAccessType(op, m1, plainThat, accessType, true);
        }
        catch (Exception ex) {
            throw new DMLRuntimeException("Failed Left Binary Compressed Operation", ex);
        }
    }

    /**
     * Chooses the execution strategy for a non-scalar right-hand operand.
     *
     * <ol>
     * <li>If both sides qualify for the double-compressed path, operate on the dictionaries only.</li>
     * <li>If {@code that} is compressed, has the same dimensions as {@code m1} and is smaller in memory,
     * swap roles: decompress {@code m1} and keep {@code that} compressed (processed with
     * {@code left = true}).</li>
     * <li>Otherwise decompress {@code that} and process {@code m1} compressed.</li>
     * </ol>
     *
     * @param op   the binary operator
     * @param m1   the compressed left operand
     * @param that the right operand
     * @return the result block
     * @throws Exception if the selected processing path fails
     */
    private static MatrixBlock binaryOperationsRightFiltered(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that) throws Exception {
        final LibMatrixBincell.BinaryAccessType atype = LibMatrixBincell.getBinaryAccessTypeExtended(m1, that);
        if (CLALibBinaryCellOp.isDoubleCompressedOpApplicable(m1, that)) {
            return CLALibBinaryCellOp.doubleCompressedBinaryOp(op, m1, (CompressedMatrixBlock)that);
        }
        // The swap reuses the access type computed for (m1, that). That is only valid when both
        // operands have identical dimensions; e.g. a compressed row vector must not take the role
        // of the "matrix" side, so the row count is checked in addition to the column count.
        final boolean sameShape = that.getNumRows() == m1.getNumRows() && that.getNumColumns() == m1.getNumColumns();
        if (that instanceof CompressedMatrixBlock && sameShape && that.getInMemorySize() < m1.getInMemorySize()) {
            final MatrixBlock m1uc = CompressedMatrixBlock.getUncompressed(m1, "Decompressing left side in BinaryOps");
            return CLALibBinaryCellOp.selectProcessingBasedOnAccessType(op, (CompressedMatrixBlock)that, m1uc, atype, true);
        }
        that = CompressedMatrixBlock.getUncompressed(that, "Decompressing right side in BinaryOps");
        return CLALibBinaryCellOp.selectProcessingBasedOnAccessType(op, m1, that, atype, false);
    }

    /**
     * Decides whether the dictionary-only "double compressed" path can be used.
     *
     * <p>Requirements: {@code that} is compressed, neither side is overlapping, each side consists of
     * exactly one column group, both groups are {@link ColGroupDDC}, and both groups share the very
     * same mapping instance (identity comparison).
     *
     * <p>The single-group requirement matters because {@code doubleCompressedBinaryOp} only reads
     * group 0 of each operand and sizes its dictionaries with the full column count of the matrix.
     *
     * @param m1   the compressed left operand
     * @param that the right operand
     * @return true if the double-compressed path is applicable
     */
    private static boolean isDoubleCompressedOpApplicable(CompressedMatrixBlock m1, MatrixBlock that) {
        if (!(that instanceof CompressedMatrixBlock) || m1.isOverlapping()) {
            return false;
        }
        final CompressedMatrixBlock cThat = (CompressedMatrixBlock)that;
        if (cThat.isOverlapping()) {
            return false;
        }
        final List<AColGroup> leftGroups = m1.getColGroups();
        final List<AColGroup> rightGroups = cThat.getColGroups();
        if (leftGroups == null || rightGroups == null || leftGroups.size() != 1 || rightGroups.size() != 1) {
            return false;
        }
        final AColGroup leftGroup = leftGroups.get(0);
        final AColGroup rightGroup = rightGroups.get(0);
        if (!(leftGroup instanceof ColGroupDDC) || !(rightGroup instanceof ColGroupDDC)) {
            return false;
        }
        // Identity comparison on purpose: only a shared mapping object guarantees aligned rows.
        return ((IMapToDataGroup)((Object)leftGroup)).getMapToData() == ((IMapToDataGroup)((Object)rightGroup)).getMapToData();
    }

    /**
     * Applies {@code op} to the dictionaries of the first column group of each operand and wraps the
     * result in a new DDC group that reuses the left group's column indexes and mapping.
     *
     * @param op the binary operator
     * @param m1 the left compressed operand
     * @param m2 the right compressed operand
     * @return a compressed block with the dimensions of {@code m1} holding the single new group
     */
    private static CompressedMatrixBlock doubleCompressedBinaryOp(BinaryOperator op, CompressedMatrixBlock m1, CompressedMatrixBlock m2) {
        LOG.debug("Double Compressed BinaryOp");
        final AColGroup leftGroup = m1.getColGroups().get(0);
        final AColGroup rightGroup = m2.getColGroups().get(0);
        final AMapToData sharedMap = ((IMapToDataGroup)((Object)leftGroup)).getMapToData();

        // Materialize both dictionaries as matrix blocks and combine them cell-wise.
        final MatrixBlock leftDict = ((ADictBasedColGroup)leftGroup).getDictionary().getMBDict(m1.getNumColumns()).getMatrixBlock();
        final MatrixBlock rightDict = ((ADictBasedColGroup)rightGroup).getDictionary().getMBDict(m2.getNumColumns()).getMatrixBlock();
        final MatrixBlock combinedDict = leftDict.binaryOperations(op, rightDict);

        final AColGroup resultGroup = ColGroupDDC.create(leftGroup.getColIndices(), MatrixBlockDictionary.create(combinedDict), sharedMap, null);
        final CompressedMatrixBlock result = new CompressedMatrixBlock(m1.getNumRows(), m1.getNumColumns());
        result.allocateColGroup(resultGroup);
        return result;
    }

    /**
     * Dispatches to the matrix-matrix, column-vector or row-vector implementation according to
     * {@code atype}; outer vector-vector operations decompress {@code m1} and run uncompressed.
     *
     * @param op    the binary operator
     * @param m1    the compressed operand
     * @param that  the other operand
     * @param atype the access type to dispatch on
     * @param left  flag forwarded unchanged to the selected implementation
     * @return the result block
     * @throws RuntimeException if {@code atype} is none of the handled access types
     * @throws Exception        if the selected implementation fails
     */
    private static MatrixBlock selectProcessingBasedOnAccessType(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, LibMatrixBincell.BinaryAccessType atype, boolean left) throws Exception {
        switch (atype) {
            case MATRIX_MATRIX:
                return CLALibBinaryCellOp.mm(op, m1, that, left);
            case COL_VECTOR_MATRIX:
            case MATRIX_COL_VECTOR:
                return CLALibBinaryCellOp.mvCol(op, m1, that, left);
            case MATRIX_ROW_VECTOR:
            case ROW_VECTOR_MATRIX:
                return CLALibBinaryCellOp.mvRow(m1, that, op, left);
            case OUTER_VECTOR_VECTOR: {
                final MatrixBlock uncompressed = CompressedMatrixBlock.getUncompressed(m1, "OVV BinaryOp: " + op.fn);
                return uncompressed.binaryOperations(op, that);
            }
            default: {
                // Any other access type is reported as a dimension mismatch.
                final String dimsLeft = m1.getNumRows() + "x" + m1.getNumColumns();
                final String dimsRight = that.getNumRows() + "x" + that.getNumColumns();
                throw new RuntimeException("Block sizes are not matched for binary cell operations: " + dimsLeft + " vs " + dimsRight);
            }
        }
    }

    /**
     * Matrix-matrix cell operation. Uses the cached decompressed version of {@code m1} when one is
     * available, otherwise falls back to the compressed implementation.
     *
     * @param op   the binary operator
     * @param m1   the compressed operand
     * @param that the uncompressed operand
     * @param left if true, {@code that} is the left-hand operand of {@code op}
     * @return the result block
     * @throws Exception if the compressed implementation fails
     */
    private static MatrixBlock mm(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, boolean left) throws Exception {
        final MatrixBlock cached = m1.getCachedDecompressed();
        if (cached == null) {
            return CLALibBinaryCellOp.mmCompressed(m1, that, op, left);
        }
        // Operand order follows the 'left' flag.
        return left ? that.binaryOperations(op, cached) : cached.binaryOperations(op, that);
    }

    /**
     * Matrix / column-vector cell operation.
     *
     * <p>When {@code that} is the right-hand operand and a cached decompressed version of {@code m1}
     * exists, the operation is executed uncompressed on that cache. In every other case the vector is
     * converted to dense and the compressed implementation is used.
     *
     * <p>Previously a left-hand vector combined with an available cache raised a
     * {@code NotImplementedException}, although the very same call succeeds through the compressed
     * path when no cache is present. The outcome therefore depended on cache state; the left-hand
     * case now always takes the compressed path.
     *
     * @param op   the binary operator
     * @param m1   the compressed operand
     * @param that the column vector operand (converted to dense in place on the compressed path)
     * @param left if true, {@code that} is the left-hand operand of {@code op}
     * @return the result block
     * @throws Exception if the compressed implementation fails
     */
    private static MatrixBlock mvCol(BinaryOperator op, CompressedMatrixBlock m1, MatrixBlock that, boolean left) throws Exception {
        if (!left) {
            final MatrixBlock cached = m1.getCachedDecompressed();
            if (cached != null) {
                return cached.binaryOperations(op, that);
            }
        }
        that.sparseToDense(op.getNumThreads());
        return CLALibBinaryCellOp.mvColCompressed(m1, that, op, left);
    }

    /**
     * Matrix / row-vector cell operation. Tries, in order: stacking for overlapping plus/minus,
     * per-group row operations, and finally full decompression of {@code m1}.
     *
     * @param m1   the compressed operand
     * @param m2   the row vector operand
     * @param op   the binary operator
     * @param left flag forwarded to the compressed implementations
     * @return the result block
     * @throws Exception if a compressed implementation fails
     */
    private static MatrixBlock mvRow(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) throws Exception {
        final CompressedMatrixBlock target = new CompressedMatrixBlock(m1.getNumRows(), m1.getNumColumns());
        final MatrixBlock result;
        if (CLALibBinaryCellOp.isValidForOverlappingBinaryCellOperations(m1, op)) {
            result = CLALibBinaryCellOp.binaryMVPlusStack(m1, m2, target, op, left);
        } else if (CLALibBinaryCellOp.isSupportedOverlappingBinCell(m1, m2, op.fn)) {
            result = CLALibBinaryCellOp.binaryMVRow(m1, m2, target, op, left);
        } else {
            // No compressed strategy applies: decompress and run the plain operation.
            result = CompressedMatrixBlock.getUncompressed(m1, "BinaryOp: " + op.fn).binaryOperations(op, m2);
        }
        return result;
    }

    /**
     * Tells whether a row-vector operation can be executed group by group.
     *
     * <p>Multiply and Divide are always accepted. Any other function is accepted only when
     * {@code m1} is not overlapping, and a Power function is additionally rejected when {@code m1}
     * has sparsity below 1.0 and the vector contains a negative value.
     *
     * @param m1 the compressed operand
     * @param m2 the row vector operand
     * @param fn the value function of the operator
     * @return true if the per-group row operation is supported
     */
    private static boolean isSupportedOverlappingBinCell(CompressedMatrixBlock m1, MatrixBlock m2, ValueFunction fn) {
        if (fn instanceof Multiply || fn instanceof Divide) {
            return true;
        }
        if (m1.isOverlapping()) {
            return false;
        }
        final boolean unsupportedPower = fn instanceof Power && m1.getSparsity() < 1.0 && CLALibBinaryCellOp.containsNegative(m2);
        return !unsupportedPower;
    }

    /**
     * Scans the first row of {@code rm} for a value strictly below zero.
     *
     * @param rm the block whose row 0 is inspected
     * @return true if any cell of row 0 is negative
     */
    private static boolean containsNegative(MatrixBlock rm) {
        final int nCols = rm.getNumColumns();
        int col = 0;
        while (col < nCols) {
            if (rm.get(0, col) < 0.0) {
                return true;
            }
            col++;
        }
        return false;
    }

    /**
     * Tells whether the stacking strategy applies: {@code m1} must be overlapping and the operator
     * must be Plus or Minus.
     *
     * @param m1 the compressed operand
     * @param op the binary operator
     * @return true if the stacking strategy can be used
     */
    private static boolean isValidForOverlappingBinaryCellOperations(CompressedMatrixBlock m1, BinaryOperator op) {
        if (!m1.isOverlapping()) {
            return false;
        }
        return op.fn instanceof Plus || op.fn instanceof Minus;
    }

    /**
     * Applies a row-vector operation to every column group of {@code m1} and stores the resulting
     * groups in {@code ret}.
     *
     * <p>Runs single-threaded when the operator requests at most one thread or when there is at
     * most one column group; otherwise one task per group is used.
     *
     * @param m1   the compressed operand
     * @param v    the dense row vector values
     * @param ret  the compressed output block to fill
     * @param op   the binary operator
     * @param left if true the left-hand variants of the group operations are used
     * @return {@code ret}
     * @throws Exception if the multi-threaded execution fails
     */
    private static CompressedMatrixBlock binaryMVRow(CompressedMatrixBlock m1, double[] v, CompressedMatrixBlock ret, BinaryOperator op, boolean left) throws Exception {
        final List<AColGroup> sourceGroups = m1.getColGroups();
        final int threads = op.getNumThreads();
        final ArrayList<AColGroup> resultGroups = new ArrayList<AColGroup>(sourceGroups.size());
        final boolean rowSafe;
        if (left) {
            rowSafe = op.isRowSafeLeft(v);
        } else {
            rowSafe = op.isRowSafeRight(v);
        }

        final boolean parallel = threads > 1 && sourceGroups.size() > 1;
        if (parallel) {
            CLALibBinaryCellOp.binaryMVRowMultiThread(sourceGroups, v, op, left, resultGroups, rowSafe, threads);
        } else {
            CLALibBinaryCellOp.binaryMVRowSingleThread(sourceGroups, v, op, left, resultGroups, rowSafe);
        }

        ret.allocateColGroupList(resultGroups);
        ret.setOverlapping(m1.isOverlapping());
        ret.examSparsity(op.getNumThreads());
        return ret;
    }

    /**
     * Sequentially applies the row operation to each group and appends the results, in input order,
     * to {@code newColGroups}.
     *
     * @param oldColGroups the groups to process
     * @param v            the dense row vector values
     * @param op           the binary operator
     * @param left         selects {@code binaryRowOpLeft} (true) or {@code binaryRowOpRight} (false)
     * @param newColGroups output list receiving one new group per input group
     * @param isRowSafe    row-safety flag forwarded to the group operation
     */
    private static void binaryMVRowSingleThread(List<AColGroup> oldColGroups, double[] v, BinaryOperator op, boolean left, List<AColGroup> newColGroups, boolean isRowSafe) {
        for (AColGroup group : oldColGroups) {
            final AColGroup transformed;
            if (left) {
                transformed = group.binaryRowOpLeft(op, v, isRowSafe);
            } else {
                transformed = group.binaryRowOpRight(op, v, isRowSafe);
            }
            newColGroups.add(transformed);
        }
    }

    /**
     * Applies the row operation to each group using one task per group on a pool of {@code k}
     * threads, appending the results to {@code newColGroups} in task order. The pool is shut down
     * in all cases.
     *
     * @param oldColGroups the groups to process
     * @param v            the dense row vector values
     * @param op           the binary operator
     * @param left         selects the left (true) or right (false) task type
     * @param newColGroups output list receiving one new group per input group
     * @param isRowSafe    row-safety flag forwarded to the tasks
     * @param k            the number of threads requested from the common thread pool
     * @throws Exception if submitting or executing a task fails
     */
    private static void binaryMVRowMultiThread(List<AColGroup> oldColGroups, double[] v, BinaryOperator op, boolean left, List<AColGroup> newColGroups, boolean isRowSafe, int k) throws Exception {
        final ExecutorService pool = CommonThreadPool.get(k);
        try {
            final ArrayList<BinaryMVRowTask> tasks = new ArrayList<BinaryMVRowTask>();
            for (AColGroup group : oldColGroups) {
                final BinaryMVRowTask task;
                if (left) {
                    task = new BinaryMVRowTaskLeft(group, v, op, isRowSafe);
                } else {
                    task = new BinaryMVRowTaskRight(group, v, op, isRowSafe);
                }
                tasks.add(task);
            }
            for (Future<?> pending : pool.invokeAll(tasks)) {
                newColGroups.add((AColGroup)pending.get());
            }
        }
        finally {
            pool.shutdown();
        }
    }

    /**
     * Convenience overload: converts the row vector block {@code m2} to a dense value array and
     * delegates to the array-based {@code binaryMVRow}.
     *
     * @param m1   the compressed operand
     * @param m2   the row vector operand
     * @param ret  the compressed output block to fill
     * @param op   the binary operator
     * @param left if true the left-hand variants of the group operations are used
     * @return {@code ret}
     * @throws Exception if the delegated call fails
     */
    private static CompressedMatrixBlock binaryMVRow(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, boolean left) throws Exception {
        final double[] denseRow = CLALibBinaryCellOp.forceRowToDense(m2);
        return CLALibBinaryCellOp.binaryMVRow(m1, denseRow, ret, op, left);
    }

    /**
     * Returns row 0 of {@code m2} as a dense value array.
     *
     * <p>For a dense block the underlying dense values are returned directly (no copy). For a sparse
     * block a new array of length {@code m2.getNumColumns()} is filled from the non-zeros of row 0;
     * a missing sparse block or an empty row 0 yields an all-zero array.
     *
     * @param m2 the block holding the row vector
     * @return the dense values of row 0
     */
    private static double[] forceRowToDense(MatrixBlock m2) {
        if (!m2.isInSparseFormat()) {
            return m2.getDenseBlockValues();
        }
        final double[] v = new double[m2.getNumColumns()];
        final SparseBlock sb = m2.getSparseBlock();
        if (sb == null || sb.isEmpty(0)) {
            return v;
        }
        final double[] spV = sb.values(0);
        final int[] spI = sb.indexes(0);
        // The row occupies [pos, pos + size) in the value/index arrays; size alone is not an end index.
        final int begin = sb.pos(0);
        final int end = begin + sb.size(0);
        for (int i = begin; i < end; ++i) {
            v[spI[i]] = spV[i];
        }
        return v;
    }

    /**
     * Executes a row-vector operation on an overlapping compressed block by modifying (or adding)
     * a single column group and keeping all other groups untouched.
     *
     * <p>The group chosen for modification is the one with the fewest distinct values among the
     * groups that span all columns and are not {@link ASDCZero}. If no such group exists, a new
     * constant group carrying the vector contribution is appended instead.
     *
     * <p>The row passed to {@code stackModifiedGroup} is densified first: for a sparse {@code m2}
     * {@code getDenseBlockValues()} yields {@code null} (see the null branch in
     * {@code stackConstGroup}), which the modified-group path does not handle.
     *
     * @param m1   the overlapping compressed operand
     * @param m2   the row vector operand
     * @param ret  the compressed output block to fill
     * @param op   the binary operator
     * @param left if true the left-hand variants of the group operations are used
     * @return {@code ret}, or an uncompressed all-zero block when no column group remains
     */
    protected static MatrixBlock binaryMVPlusStack(CompressedMatrixBlock m1, MatrixBlock m2, CompressedMatrixBlock ret, BinaryOperator op, boolean left) {
        final List<AColGroup> oldColGroups = m1.getColGroups();
        final int size = oldColGroups.size();
        final List<AColGroup> newColGroups = new ArrayList<AColGroup>(size);
        final int nCol = m1.getNumColumns();

        // Copy all groups and remember the cheapest full-width candidate for modification.
        int smallestIndex = 0;
        int smallestSize = Integer.MAX_VALUE;
        for (int i = 0; i < size; ++i) {
            final AColGroup g = oldColGroups.get(i);
            final int numValues = g.getNumValues();
            newColGroups.add(g);
            if (numValues < smallestSize && g.getNumCols() == nCol && !(g instanceof ASDCZero)) {
                smallestIndex = i;
                smallestSize = numValues;
            }
        }

        if (smallestSize == Integer.MAX_VALUE) {
            // stackConstGroup handles a null (sparse) row itself.
            CLALibBinaryCellOp.stackConstGroup(m2, op, left, newColGroups, nCol, m2.getDenseBlockValues());
        } else {
            CLALibBinaryCellOp.stackModifiedGroup(op, left, newColGroups, smallestIndex, CLALibBinaryCellOp.forceRowToDense(m2));
        }

        if (newColGroups.isEmpty()) {
            return new MatrixBlock(ret.getNumRows(), ret.getNumColumns(), 0.0);
        }
        ret.allocateColGroupList(newColGroups);
        ret.setOverlapping(true);
        ret.setNonZeros(-1L);
        return ret;
    }

    /**
     * Replaces the group at {@code smallestIndex} with the result of applying the row operation to
     * it; if that result is a {@link ColGroupEmpty}, the entry is removed from the list instead.
     *
     * @param op            the binary operator
     * @param left          selects {@code binaryRowOpLeft} (true) or {@code binaryRowOpRight} (false)
     * @param newColGroups  the group list to update in place
     * @param smallestIndex position of the group to modify
     * @param row           the dense row vector values
     */
    private static void stackModifiedGroup(BinaryOperator op, boolean left, List<AColGroup> newColGroups, int smallestIndex, double[] row) {
        final AColGroup original = newColGroups.get(smallestIndex);
        final AColGroup modified;
        if (left) {
            modified = original.binaryRowOpLeft(op, row, op.isRowSafeLeft(row));
        } else {
            modified = original.binaryRowOpRight(op, row, op.isRowSafeRight(row));
        }
        if (modified instanceof ColGroupEmpty) {
            newColGroups.remove(smallestIndex);
            return;
        }
        newColGroups.set(smallestIndex, modified);
    }

    /**
     * Builds a constant column group that represents the row vector's contribution and appends it
     * to {@code newColGroups} unless it turns out to be a {@link ColGroupEmpty}.
     *
     * <p>With a {@code null} row the values are taken from row 0 of the sparse block of {@code m2}:
     * each stored value is combined with 0.0 through {@code op.fn}, all other columns stay 0.0.
     * With a non-null row a zero constant group is created and the row operation is applied to it.
     *
     * @param m2           the row vector operand (only read when {@code row} is null)
     * @param op           the binary operator
     * @param left         if true the vector value is the first argument of the function
     * @param newColGroups the group list to append to
     * @param nCol         number of columns of the constant group
     * @param row          dense row values, or null when {@code m2} has no dense values
     */
    private static void stackConstGroup(MatrixBlock m2, BinaryOperator op, boolean left, List<AColGroup> newColGroups, int nCol, double[] row) {
        final AColGroup constGroup;
        if (row != null) {
            final AColGroup zeros = ColGroupConst.create(nCol, 0.0);
            constGroup = left ? zeros.binaryRowOpLeft(op, row, op.isRowSafeLeft(row)) : zeros.binaryRowOpRight(op, row, op.isRowSafeRight(row));
        } else {
            final double[] constValues = new double[nCol];
            final SparseBlock sparse = m2.getSparseBlock();
            final double[] values = sparse.values(0);
            final int[] colIdx = sparse.indexes(0);
            final int count = sparse.size(0);
            for (int p = 0; p < count; ++p) {
                final double cell = values[p];
                constValues[colIdx[p]] = left ? op.fn.execute(cell, 0.0) : op.fn.execute(0.0, cell);
            }
            constGroup = ColGroupConst.create(constValues);
        }
        if (constGroup instanceof ColGroupEmpty) {
            return;
        }
        newColGroups.add(constGroup);
    }

    /**
     * Matrix / column-vector cell operation executed on the compressed representation.
     *
     * <p>After the optional pre-filtering in {@code morph}, the output sparsity is estimated from a
     * sample. Comparison functions with an estimated at most 2 non-zeros per row and at most 31
     * columns produce a compressed result; otherwise a sparse or dense uncompressed block is
     * filled, single- or multi-threaded depending on the operator's thread count. For comparison
     * functions a completely full or completely empty result is replaced by a constant block.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the binary operator
     * @param left flag forwarded to the worker tasks
     * @return the result block
     * @throws Exception if a multi-threaded execution fails
     */
    private static MatrixBlock mvColCompressed(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) throws Exception {
        final int nCols = m1.getNumColumns();
        final int nRows = m1.getNumRows();
        m1 = CLALibBinaryCellOp.morph(m1);
        final boolean singleThreaded = op.getNumThreads() <= 1;

        final Pair<Double, Double> estimate = CLALibBinaryCellOp.evaluateSparsityMVCol(m1, m2, op, left);
        final double estSparsity = estimate.getKey();
        final double estNnzPerRow = estimate.getValue();
        final long estNnz = (long)(estSparsity * (double)nRows * (double)nCols);
        final boolean sparseOut = MatrixBlock.evalSparseFormatInMemory(nRows, nCols, estNnz);
        final boolean isComparison = op.fn instanceof ValueComparisonFunction;

        if (estNnzPerRow <= 2.0 && nCols <= 31 && isComparison) {
            if (singleThreaded) {
                return CLALibBinaryCellOp.binaryMVComparisonColSingleThreadCompressed(m1, m2, op, left);
            }
            return CLALibBinaryCellOp.binaryMVComparisonColMultiCompressed(m1, m2, op, left);
        }

        final MatrixBlock ret = new MatrixBlock(nRows, nCols, sparseOut, -1L).allocateBlock();
        final long nnz;
        if (sparseOut) {
            nnz = singleThreaded ? CLALibBinaryCellOp.binaryMVColSingleThreadSparse(m1, m2, op, left, ret) : CLALibBinaryCellOp.binaryMVColMultiThreadSparse(m1, m2, op, left, ret);
        } else {
            nnz = singleThreaded ? CLALibBinaryCellOp.binaryMVColSingleThreadDense(m1, m2, op, left, ret) : CLALibBinaryCellOp.binaryMVColMultiThreadDense(m1, m2, op, left, ret);
        }

        if (isComparison) {
            // Degenerate comparison results are returned as constants.
            if (nnz == (long)nRows * (long)nCols) {
                return CompressedMatrixBlockFactory.createConstant(nRows, nCols, 1.0);
            }
            if (nnz == 0L) {
                return new MatrixBlock(nRows, nCols, 0.0);
            }
        }
        ret.setNonZeros(nnz);
        ret.examSparsity(op.getNumThreads());
        return ret;
    }

    /**
     * Single-threaded comparison against a column vector that yields a compressed result.
     *
     * <p>One task computes an integer indicator per row; each distinct indicator receives a
     * dictionary id in order of first appearance, and the per-row ids form the mapping of the
     * resulting DDC group.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the comparison operator
     * @param left flag forwarded to the task
     * @return the compressed result block
     */
    private static MatrixBlock binaryMVComparisonColSingleThreadCompressed(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) {
        final int nRows = m1.getNumRows();
        final int nCols = m1.getNumColumns();
        final BinaryMVColTaskCompressed task = new BinaryMVColTaskCompressed(m1, m2, 0, nRows, op, left);
        final long nnz = task.call();
        final int[] indicators = task._ret;

        final HashMapToInt<Integer> hm = new HashMapToInt<Integer>(nCols * 3);
        final int[] colMap = new int[nRows];
        int row = 0;
        while (row < m1.getNumRows()) {
            final int candidateId = hm.size();
            final int existingId = hm.putIfAbsentI(indicators[row], candidateId);
            // -1 signals that the indicator was newly inserted with the candidate id.
            colMap[row] = existingId == -1 ? candidateId : existingId;
            row++;
        }

        final MatrixBlock dictBlock = CLALibBinaryCellOp.getMCSRMatrixBlock(hm, nCols);
        return CLALibBinaryCellOp.getCompressedMatrixBlock(m1, colMap, hm.size(), dictBlock, nRows, nCols, nnz);
    }

    /**
     * Decodes a bit indicator into a sparse row of ones and stores it at row {@code rix} of
     * {@code out}.
     *
     * <p>The lowest bit of the indicator corresponds to column {@code numCol - 1}, the next bit to
     * {@code numCol - 2}, and so on. Decoding stops once the remaining indicator is no longer
     * positive or all columns were visited. Zero set bits store a {@code null} row, one set bit a
     * {@link SparseRowScalar}, several set bits a {@link SparseRowVector} with ascending indexes.
     *
     * @param numCol    number of columns encoded in the indicator
     * @param indicator bit pattern of the non-zero columns
     * @param rix       target row index in {@code out}
     * @param out       the sparse block receiving the row
     */
    private static void fillSparseBlockFromIndicatorFromIndicatorInt(int numCol, Integer indicator, Integer rix, SparseBlockMCSR out) {
        // Fill a scratch buffer from the back so the collected column indexes end up ascending.
        final int[] scratch = new int[Math.max(numCol, 0)];
        int first = scratch.length;
        int bits = indicator;
        for (int col = numCol - 1; col >= 0 && bits > 0; --col) {
            if ((bits & 1) == 1) {
                scratch[--first] = col;
            }
            bits >>= 1;
        }

        final int setBits = scratch.length - first;
        final SparseRow row;
        if (setBits > 1) {
            final double[] ones = new double[setBits];
            Arrays.fill(ones, 1.0);
            row = new SparseRowVector(ones, Arrays.copyOfRange(scratch, first, scratch.length));
        } else if (setBits == 1) {
            row = new SparseRowScalar(scratch[first], 1.0);
        } else {
            row = null;
        }
        out.set((int)rix, row, false);
    }

    /**
     * Multi-threaded comparison against a column vector that yields a compressed result.
     *
     * <p>The rows are split into blocks of {@code blkz} rows, one task per block. After all tasks
     * finished, their per-row indicators are merged into a shared dictionary and mapping, from
     * which a single DDC group is built. The pool is shut down in all cases.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the comparison operator (its thread count determines the partitioning)
     * @param left flag forwarded to the tasks
     * @return the compressed result block
     * @throws Exception if submitting or executing a task fails
     */
    private static MatrixBlock binaryMVComparisonColMultiCompressed(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) throws Exception {
        final int nRows = m1.getNumRows();
        final int nCols = m1.getNumColumns();
        final int k = op.getNumThreads();
        // nRows / k is 0 whenever nRows < k; a zero block size would never advance the
        // partitioning loop below, so use at least one row per task.
        final int blkz = Math.max(nRows / k, 1);
        final ExecutorService pool = CommonThreadPool.get(k);
        try {
            final ArrayList<BinaryMVColTaskCompressed> tasks = new ArrayList<BinaryMVColTaskCompressed>();
            for (int i = 0; i < nRows; i += blkz) {
                tasks.add(new BinaryMVColTaskCompressed(m1, m2, i, Math.min(nRows, i + blkz), op, left));
            }
            long nnz = 0L;
            for (Future<?> f : pool.invokeAll(tasks)) {
                nnz += ((Long)f.get()).longValue();
            }
            final HashMapToInt<Integer> hm = new HashMapToInt<Integer>(nCols * 2);
            final int[] colMap = new int[nRows];
            // Task j covers the rows starting at j * blkz, which the merge relies on.
            CLALibBinaryCellOp.mergeMVColTaskResults(tasks, blkz, hm, colMap);
            final MatrixBlock outMb = CLALibBinaryCellOp.getMCSRMatrixBlock(hm, nCols);
            return CLALibBinaryCellOp.getCompressedMatrixBlock(m1, colMap, hm.size(), outMb, nRows, nCols, nnz);
        }
        finally {
            pool.shutdown();
        }
    }

    /**
     * Merges the per-row indicators of all tasks into one dictionary and mapping.
     *
     * <p>Tasks are visited in list order and each task's indicators in row order. For every row
     * the value returned by {@code hm.putIfAbsentReturnVal(indicator, hm.size())} is written to
     * {@code colMap} at the row's global position {@code j * blkz + localRow}.
     *
     * @param tasks  the finished tasks whose {@code _ret} arrays hold the indicators
     * @param blkz   number of rows assigned to each task (except possibly the last)
     * @param hm     dictionary from indicator to id, updated in place
     * @param colMap output mapping from global row index to dictionary id
     */
    private static void mergeMVColTaskResults(ArrayList<BinaryMVColTaskCompressed> tasks, int blkz, HashMapToInt<Integer> hm, int[] colMap) {
        int taskIdx = 0;
        for (BinaryMVColTaskCompressed task : tasks) {
            final int[] indicators = task._ret;
            final int base = taskIdx * blkz;
            for (int local = 0; local < indicators.length; ++local) {
                colMap[base + local] = hm.putIfAbsentReturnVal(indicators[local], hm.size());
            }
            taskIdx++;
        }
    }

    /**
     * Wraps a dictionary block and a row mapping into a compressed block consisting of a single
     * DDC group over the column range {@code [0, m1.getNumColumns())}.
     *
     * @param m1      source block providing the row and column counts for the mapping and indexes
     * @param colMap  per-row dictionary ids
     * @param mapSize number of distinct ids in {@code colMap}
     * @param outMb   the dictionary contents
     * @param nRows   number of rows of the result
     * @param nCols   number of columns of the result
     * @param nnz     number of non-zeros of the result
     * @return the compressed, non-overlapping result block
     */
    private static CompressedMatrixBlock getCompressedMatrixBlock(CompressedMatrixBlock m1, int[] colMap, int mapSize, MatrixBlock outMb, int nRows, int nCols, long nnz) {
        final IColIndex allColumns = ColIndexFactory.create(0, m1.getNumColumns());
        final AMapToData rowToId = MapToFactory.create(m1.getNumRows(), colMap, mapSize);
        final ArrayList<AColGroup> singleGroup = new ArrayList<AColGroup>(1);
        singleGroup.add(ColGroupDDC.create(allColumns, MatrixBlockDictionary.create(outMb), rowToId, null));
        return new CompressedMatrixBlock(nRows, nCols, nnz, false, singleGroup);
    }

    /**
     * Builds the dictionary matrix for a set of indicators: one sparse row per map entry, placed at
     * the row index stored as the entry's value.
     *
     * @param hm    map from bit indicator to dictionary row index
     * @param nCols number of columns of the dictionary
     * @return a block with {@code hm.size()} rows backed by the filled sparse block
     */
    private static MatrixBlock getMCSRMatrixBlock(HashMapToInt<Integer> hm, int nCols) {
        final int dictRows = hm.size();
        final SparseBlockMCSR rows = new SparseBlockMCSR(dictRows);
        hm.forEach((bits, rowIdx) -> CLALibBinaryCellOp.fillSparseBlockFromIndicatorFromIndicatorInt(nCols, bits, rowIdx, rows));
        return new MatrixBlock(hm.size(), nCols, -1L, rows);
    }

    /**
     * Runs the dense column-vector task over all rows on the calling thread.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the binary operator
     * @param left flag forwarded to the task
     * @param ret  the dense output block to fill
     * @return the value reported by the task (used by the caller as non-zero count)
     */
    private static long binaryMVColSingleThreadDense(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) {
        final int rowEnd = m1.getNumRows();
        final BinaryMVColTaskDense wholeRange = new BinaryMVColTaskDense(m1, m2, ret, 0, rowEnd, op, left);
        return wholeRange.call().longValue();
    }

    /**
     * Runs the sparse column-vector task over all rows on the calling thread.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the binary operator
     * @param left flag forwarded to the task
     * @param ret  the sparse output block to fill
     * @return the value reported by the task (used by the caller as non-zero count)
     */
    private static long binaryMVColSingleThreadSparse(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) {
        final int rowEnd = m1.getNumRows();
        final BinaryMVColTaskSparse wholeRange = new BinaryMVColTaskSparse(m1, m2, ret, 0, rowEnd, op, left);
        return wholeRange.call().longValue();
    }

    /**
     * Runs the dense column-vector operation in parallel, one task per block of rows, and sums the
     * values reported by the tasks. The pool is shut down in all cases.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the binary operator (its thread count determines the partitioning)
     * @param left flag forwarded to the tasks
     * @param ret  the dense output block to fill
     * @return the summed task results (used by the caller as non-zero count)
     * @throws Exception if submitting or executing a task fails
     */
    private static long binaryMVColMultiThreadDense(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) throws Exception {
        final int nRows = m1.getNumRows();
        final int k = op.getNumThreads();
        // ret.getNumRows() / k is 0 when there are fewer rows than threads; a zero block size
        // would make the partitioning loop spin forever, so use at least one row per task.
        final int blkz = Math.max(ret.getNumRows() / k, 1);
        long nnz = 0L;
        final ExecutorService pool = CommonThreadPool.get(k);
        try {
            final ArrayList<BinaryMVColTaskDense> tasks = new ArrayList<BinaryMVColTaskDense>();
            for (int i = 0; i < nRows; i += blkz) {
                tasks.add(new BinaryMVColTaskDense(m1, m2, ret, i, Math.min(nRows, i + blkz), op, left));
            }
            for (Future<?> f : pool.invokeAll(tasks)) {
                nnz += ((Long)f.get()).longValue();
            }
        }
        finally {
            pool.shutdown();
        }
        return nnz;
    }

    /**
     * Runs the sparse column-vector operation in parallel, one task per block of at least 64 rows,
     * and sums the values reported by the tasks. The pool is shut down in all cases.
     *
     * @param m1   the compressed operand
     * @param m2   the column vector operand
     * @param op   the binary operator (its thread count determines the partitioning)
     * @param left flag forwarded to the tasks
     * @param ret  the sparse output block to fill
     * @return the summed task results (used by the caller as non-zero count)
     * @throws Exception if submitting or executing a task fails
     */
    private static long binaryMVColMultiThreadSparse(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) throws Exception {
        final int nRows = m1.getNumRows();
        final int threads = op.getNumThreads();
        final int rowsPerTask = Math.max(nRows / threads, 64);
        final ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
        long total = 0L;
        try {
            final ArrayList<BinaryMVColTaskSparse> tasks = new ArrayList<BinaryMVColTaskSparse>();
            int start = 0;
            while (start < nRows) {
                final int end = Math.min(nRows, start + rowsPerTask);
                tasks.add(new BinaryMVColTaskSparse(m1, m2, ret, start, end, op, left));
                start += rowsPerTask;
            }
            for (Future<?> pending : pool.invokeAll(tasks)) {
                total += ((Long)pending.get()).longValue();
            }
        }
        finally {
            pool.shutdown();
        }
        return total;
    }

    /**
     * Matrix-matrix cell operation on the compressed representation, writing into a newly
     * allocated dense block whose sparsity is examined afterwards.
     *
     * @param m1   the compressed operand
     * @param m2   the uncompressed operand
     * @param op   the binary operator
     * @param left flag forwarded to the worker tasks
     * @return the result block
     * @throws Exception if the execution fails
     */
    private static MatrixBlock mmCompressed(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) throws Exception {
        final int cols = m1.getNumColumns();
        final int rows = m1.getNumRows();
        final CompressedMatrixBlock prepared = CLALibBinaryCellOp.morph(m1);
        final MatrixBlock result = new MatrixBlock(rows, cols, false, -1L).allocateBlock();
        result.setNonZeros(CLALibBinaryCellOp.binaryMMExec(prepared, m2, op, left, result));
        result.examSparsity(op.getNumThreads());
        return result;
    }

    /**
     * Picks the single-threaded or parallel matrix-matrix execution based on the operator's thread
     * count, using row blocks of at least 10 rows.
     *
     * @param m1   the compressed operand
     * @param m2   the uncompressed operand
     * @param op   the binary operator
     * @param left flag forwarded to the worker tasks
     * @param ret  the output block to fill
     * @return the summed task results (used by the caller as non-zero count)
     * @throws Exception if the parallel execution fails
     */
    private static long binaryMMExec(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret) throws Exception {
        final int nRows = m1.getNumRows();
        final int threads = op.getNumThreads();
        final int rowsPerBlock = Math.max(ret.getNumRows() / threads, 10);
        if (threads <= 1) {
            return CLALibBinaryCellOp.binaryMMSingleThread(m1, m2, op, left, ret, nRows, rowsPerBlock);
        }
        return CLALibBinaryCellOp.binaryMMParallel(m1, m2, op, left, ret, nRows, rowsPerBlock);
    }

    /**
     * Executes the matrix-matrix operation with one task per block of {@code blkz} rows on the
     * common thread pool and sums the values reported by the tasks. The pool is shut down in all
     * cases.
     *
     * @param m1    the compressed operand
     * @param m2    the uncompressed operand
     * @param op    the binary operator (its thread count sizes the pool)
     * @param left  flag forwarded to the tasks
     * @param ret   the output block to fill
     * @param nRows total number of rows to process
     * @param blkz  number of rows per task
     * @return the summed task results
     * @throws InterruptedException if waiting for the tasks is interrupted
     * @throws ExecutionException   if a task fails
     */
    private static long binaryMMParallel(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret, int nRows, int blkz) throws InterruptedException, ExecutionException {
        final ExecutorService pool = CommonThreadPool.get(op.getNumThreads());
        long total = 0L;
        try {
            final ArrayList<BinaryMMTask> tasks = new ArrayList<BinaryMMTask>();
            int start = 0;
            while (start < nRows) {
                final int end = Math.min(nRows, start + blkz);
                tasks.add(new BinaryMMTask(m1, m2, ret, start, end, op, left));
                start += blkz;
            }
            for (Future<?> pending : pool.invokeAll(tasks)) {
                total += ((Long)pending.get()).longValue();
            }
        }
        finally {
            pool.shutdown();
        }
        return total;
    }

    /**
     * Executes the matrix-matrix operation block by block on the calling thread and sums the
     * values reported by the tasks.
     *
     * @param m1    the compressed operand
     * @param m2    the uncompressed operand
     * @param op    the binary operator
     * @param left  flag forwarded to the tasks
     * @param ret   the output block to fill
     * @param nRows total number of rows to process
     * @param blkz  number of rows per block
     * @return the summed task results
     */
    private static long binaryMMSingleThread(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left, MatrixBlock ret, int nRows, int blkz) {
        long total = 0L;
        int start = 0;
        while (start < nRows) {
            final int end = Math.min(nRows, start + blkz);
            final BinaryMMTask block = new BinaryMMTask(m1, m2, ret, start, end, op, left);
            total += block.call().longValue();
            start += blkz;
        }
        return total;
    }

    /**
     * Optionally rewrites the column groups of {@code m} before processing.
     *
     * <p>When {@code CLALibUtils.shouldPreFilter} asks for it, the groups are filtered through
     * {@code CLALibUtils.filterGroups}, which also fills a per-column constant vector; that vector
     * is appended as an extra constant group and the new list is installed on a copy of {@code m}.
     * Otherwise {@code m} is returned unchanged.
     *
     * @param m the compressed block
     * @return {@code m} itself, or a copy carrying the filtered groups plus the constant group
     */
    private static CompressedMatrixBlock morph(CompressedMatrixBlock m) {
        final List<AColGroup> original = m.getColGroups();
        if (!CLALibUtils.shouldPreFilter(original)) {
            return m;
        }
        final CompressedMatrixBlock copy = new CompressedMatrixBlock(m);
        final double[] constants = new double[m.getNumColumns()];
        final List<AColGroup> filtered = CLALibUtils.filterGroups(original, constants);
        filtered.add(ColGroupConst.create(constants));
        copy.allocateColGroupList(filtered);
        return copy;
    }

    /**
     * Allocates a dense temporary block with {@code cols} columns and
     * {@code max(16384 / cols, 64)} rows.
     *
     * @param cols number of columns of the temporary block
     * @return the allocated dense block
     */
    private static MatrixBlock allocateTempUncompressedBlock(int cols) {
        // 16384 is the same value as DECOMPRESSION_BLEN declared on this class.
        final int rows = Math.max(16384 / cols, 64);
        final MatrixBlock tmp = new MatrixBlock(rows, cols, false);
        tmp.allocateBlock();
        return tmp;
    }

    /**
     * Decompresses the rows {@code [rl, ru)} of all groups into {@code db} with row and column
     * offsets of 0. Groups whose compression type is SDC are decompressed through their
     * iterator from {@code its}; all others use the plain call. When statistics are enabled the
     * elapsed time is recorded and, at trace level, logged.
     *
     * @param rl     first row (inclusive)
     * @param ru     last row (exclusive)
     * @param db     the dense target block
     * @param groups the column groups to decompress
     * @param its    per-group iterators, read only for SDC groups
     */
    protected static void decompressToSubBlock(int rl, int ru, DenseBlock db, List<AColGroup> groups, AIterator[] its) {
        final Timing timer = new Timing(true);
        int idx = 0;
        for (AColGroup group : groups) {
            if (group.getCompType() == AColGroup.CompressionType.SDC) {
                ((ASDCZero)group).decompressToDenseBlock(db, rl, ru, 0, 0, its[idx]);
            } else {
                group.decompressToDenseBlock(db, rl, ru, 0, 0);
            }
            idx++;
        }
        if (!DMLScript.STATISTICS) {
            return;
        }
        final double elapsed = timer.stop();
        DMLCompressionStatistics.addDecompressToBlockTime(elapsed, 1);
        if (LOG.isTraceEnabled()) {
            LOG.trace("decompressed block w/ k=1 in " + elapsed + "ms.");
        }
    }

    /**
     * Decompresses the rows {@code [rl, ru)} of all groups into {@code db} using a row offset of
     * {@code -rl} and a column offset of 0. Groups whose compression type is SDC are decompressed
     * through their iterator from {@code its}; all others use the plain call. When statistics are
     * enabled the elapsed time is recorded and, at trace level, logged.
     *
     * @param rl     first row (inclusive)
     * @param ru     last row (exclusive)
     * @param db     the dense temporary target block
     * @param groups the column groups to decompress
     * @param its    per-group iterators, read only for SDC groups
     */
    protected static void decompressToTmpBlock(int rl, int ru, DenseBlock db, List<AColGroup> groups, AIterator[] its) {
        final Timing timer = new Timing(true);
        final int rowOffset = -rl;
        int idx = 0;
        for (AColGroup group : groups) {
            if (group.getCompType() == AColGroup.CompressionType.SDC) {
                ((ASDCZero)group).decompressToDenseBlock(db, rl, ru, rowOffset, 0, its[idx]);
            } else {
                group.decompressToDenseBlock(db, rl, ru, rowOffset, 0);
            }
            idx++;
        }
        if (!DMLScript.STATISTICS) {
            return;
        }
        final double elapsed = timer.stop();
        DMLCompressionStatistics.addDecompressToBlockTime(elapsed, 1);
        if (LOG.isTraceEnabled()) {
            LOG.trace("decompressed block w/ k=1 in " + elapsed + "ms.");
        }
    }

    /**
     * Builds an iterator array parallel to {@code groups}.
     *
     * <p>For every group whose compression type is {@code SDC}, the group is cast to {@link ASDCZero} and its
     * {@code getIterator(rl)} result is stored at the group's index. Entries for all other groups stay
     * {@code null}.
     *
     * @param groups column groups to inspect
     * @param rl     row passed to {@code getIterator}
     * @return array with one slot per group
     */
    protected static AIterator[] getIterators(List<AColGroup> groups, int rl) {
        final AIterator[] iterators = new AIterator[groups.size()];
        for (int idx = 0; idx < groups.size(); idx++) {
            final AColGroup group = groups.get(idx);
            if (group.getCompType() == AColGroup.CompressionType.SDC) {
                iterators[idx] = ((ASDCZero) group).getIterator(rl);
            }
        }
        return iterators;
    }

    /**
     * Estimates the output sparsity of a matrix / column-vector cell operation from a small sample.
     *
     * <p>The first {@code min(nRow, 5)} rows of {@code m1} are decompressed into a dense array, the operator is
     * applied to each sampled cell together with the entry of {@code m2} for that row, and the non-zero results
     * are counted.
     *
     * @param m1   compressed input; supplies the column groups and the dimensions
     * @param m2   second operand; read through {@code getDenseBlockValues()} and indexed by row
     * @param op   binary operator whose {@code fn} is executed per cell
     * @param left if {@code true} the {@code m2} value is the first operand, otherwise the second
     * @return pair of (non-zero count divided by number of sampled cells, average non-zero count per sampled row)
     */
    private static Pair<Double, Double> evaluateSparsityMVCol(CompressedMatrixBlock m1, MatrixBlock m2, BinaryOperator op, boolean left) {
        final List<AColGroup> colGroups = m1.getColGroups();
        final int nCol = m1.getNumColumns();
        final int nRow = m1.getNumRows();
        final int sampleRow = Math.min(nRow, 5);
        final int sampleCol = nCol;
        final int sampleNCells = sampleRow * sampleCol;
        final double[] sample = new double[sampleNCells];
        final double[] vector = m2.getDenseBlockValues();
        CLALibBinaryCellOp.decompressToDense(colGroups, sampleRow, sampleCol, sample);

        final int[] nnzPerRow = new int[sampleRow];
        int nnz = 0;
        for (int r = 0; r < sampleRow; r++) {
            final double vecVal = vector[r];
            final int rowStart = r * sampleCol;
            int rowNnz = 0;
            for (int c = 0; c < sampleCol; c++) {
                final double cell = sample[rowStart + c];
                // Operand order depends on which side the vector is on.
                final double result = left ? op.fn.execute(vecVal, cell) : op.fn.execute(cell, vecVal);
                if (result != 0.0) {
                    rowNnz++;
                }
            }
            nnzPerRow[r] = rowNnz;
            nnz += rowNnz;
        }

        double sum = 0.0;
        for (int count : nnzPerRow) {
            sum += (double) count;
        }
        return new Pair<Double, Double>((double) nnz / (double) sampleNCells, sum / (double) sampleRow);
    }

    /**
     * Decompresses rows {@code [0, sampleRow)} of every group into {@code dv}.
     *
     * <p>{@code dv} is wrapped in a {@link DenseBlockFP64} with dimensions {@code sampleRow x sampleCol}, so the
     * groups write directly into the caller's array.
     *
     * @param groups    column groups to decompress
     * @param sampleRow number of rows to decompress
     * @param sampleCol number of columns of the wrapped block
     * @param dv        backing array handed to the dense block
     */
    private static void decompressToDense(List<AColGroup> groups, int sampleRow, int sampleCol, double[] dv) {
        final int[] dims = new int[] {sampleRow, sampleCol};
        final DenseBlockFP64 target = new DenseBlockFP64(dims, dv);
        for (AColGroup group : groups) {
            group.decompressToDenseBlock(target, 0, sampleRow);
        }
    }

    /**
     * Row-vector task that delegates to {@code AColGroup.binaryRowOpRight} with the stored operator, vector and
     * row-safe flag.
     */
    private static class BinaryMVRowTaskRight extends BinaryMVRowTask {

        protected BinaryMVRowTaskRight(AColGroup group, double[] v, BinaryOperator op, boolean isRowSafe) {
            super(group, v, op, isRowSafe);
        }

        /** @return the column group produced by {@code binaryRowOpRight} */
        @Override
        public AColGroup call() {
            final AColGroup result = _group.binaryRowOpRight(_op, _v, _isRowSafe);
            return result;
        }
    }

    /**
     * Row-vector task that delegates to {@code AColGroup.binaryRowOpLeft} with the stored operator, vector and
     * row-safe flag.
     */
    private static class BinaryMVRowTaskLeft extends BinaryMVRowTask {

        protected BinaryMVRowTaskLeft(AColGroup group, double[] v, BinaryOperator op, boolean isRowSafe) {
            super(group, v, op, isRowSafe);
        }

        /** @return the column group produced by {@code binaryRowOpLeft} */
        @Override
        public AColGroup call() {
            final AColGroup result = _group.binaryRowOpLeft(_op, _v, _isRowSafe);
            return result;
        }
    }

    /**
     * Shared state for the per-group row-vector tasks: the column group to operate on, the vector values, the
     * operator and the row-safe flag. Subclasses supply {@code call()}.
     */
    private abstract static class BinaryMVRowTask implements Callable<AColGroup> {
        /** Column group this task operates on. */
        protected final AColGroup _group;
        /** Vector values handed to the group's row operation. */
        protected final double[] _v;
        /** Operator handed to the group's row operation. */
        protected final BinaryOperator _op;
        /** Flag forwarded unchanged to the group's row operation. */
        protected final boolean _isRowSafe;

        protected BinaryMVRowTask(AColGroup group, double[] v, BinaryOperator op, boolean isRowSafe) {
            _group = group;
            _v = v;
            _op = op;
            _isRowSafe = isRowSafe;
        }
    }

    /**
     * Task that applies a cell-wise binary operator between rows {@code [_rl, _ru)} of a compressed matrix and
     * the same rows of a second matrix, writing into the dense block of {@code _ret}.
     *
     * <p>Each block of rows is first decompressed in place into {@code _ret}; the operator is then applied cell
     * by cell, overwriting the decompressed values. {@code _left} selects whether the second matrix is the first
     * or the second operand. {@code call()} returns the sum of {@code recomputeNonZeros} over the processed
     * blocks.
     */
    private static class BinaryMMTask implements Callable<Long> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final boolean _left;
        private final BinaryOperator _op;

        protected BinaryMMTask(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, BinaryOperator op, boolean left) {
            _m1 = m1;
            _m2 = m2;
            _ret = ret;
            _op = op;
            _rl = rl;
            _ru = ru;
            _left = left;
        }

        /**
         * Processes the row range in blocks of {@code max(16384 / nCols / nGroups, 64)} rows.
         *
         * @return accumulated non-zero count reported by {@code _ret.recomputeNonZeros} per block
         */
        @Override
        public Long call() {
            final List<AColGroup> groups = _m1.getColGroups();
            final int blockLength = Math.max(16384 / _ret.getNumColumns() / groups.size(), 64);
            final AIterator[] its = CLALibBinaryCellOp.getIterators(groups, _rl);
            long nnz = 0L;
            int start = _rl;
            while (start < _ru) {
                final int end = Math.min(start + blockLength, _ru);
                processBlock(start, end, groups, its);
                // recomputeNonZeros is called with the last processed row, hence end - 1.
                nnz += _ret.recomputeNonZeros(start, end - 1);
                start += blockLength;
            }
            return nnz;
        }

        /** Decompresses rows {@code [rl, ru)} into the result and then applies the operator on them. */
        private void processBlock(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            final DenseBlock target = _ret.getDenseBlock();
            CLALibBinaryCellOp.decompressToSubBlock(rl, ru, target, groups, its);
            if (!_left) {
                processRight(rl, ru);
            } else {
                processLeft(rl, ru);
            }
        }

        /** Dispatches on the format of {@code _m2} for the case where {@code _m2} is the first operand. */
        private void processLeft(int rl, int ru) {
            if (!_m2.isInSparseFormat()) {
                processLeftDense(rl, ru);
            } else {
                processLeftSparse(rl, ru);
            }
        }

        /** Applies {@code fn(m2, ret)} per cell, reading {@code _m2} from its sparse block. */
        private void processLeftSparse(int rl, int ru) {
            final DenseBlock out = _ret.getDenseBlock();
            final int nCol = _ret.getNumColumns();
            final SparseBlock sb = _m2.getSparseBlock();
            for (int row = rl; row < ru; row++) {
                final double[] outVals = out.values(row);
                final int base = out.pos(row);
                if (sb.isEmpty(row)) {
                    // No stored entries in this row of m2: every operand from m2 is 0.
                    final int end = base + nCol;
                    for (int ix = base; ix < end; ix++) {
                        outVals[ix] = _op.fn.execute(0.0, outVals[ix]);
                    }
                } else {
                    final int apos = sb.pos(row);
                    final int aend = apos + sb.size(row);
                    final int[] aix = sb.indexes(row);
                    final double[] avals = sb.values(row);
                    int k = apos;
                    for (int col = 0; col < nCol; col++) {
                        // Consume the next stored entry only when it belongs to this column; otherwise use 0.
                        final double m2Val = (k < aend && aix[k] == col) ? avals[k++] : 0.0;
                        final int ix = base + col;
                        outVals[ix] = _op.fn.execute(m2Val, outVals[ix]);
                    }
                }
            }
        }

        /** Applies {@code fn(m2, ret)} per cell, reading {@code _m2} from its dense block at the same index. */
        private void processLeftDense(int rl, int ru) {
            final DenseBlock out = _ret.getDenseBlock();
            final int nCol = _ret.getNumColumns();
            final DenseBlock in = _m2.getDenseBlock();
            for (int row = rl; row < ru; row++) {
                final double[] outVals = out.values(row);
                final double[] inVals = in.values(row);
                final int begin = out.pos(row);
                final int end = begin + nCol;
                for (int ix = begin; ix < end; ix++) {
                    outVals[ix] = _op.fn.execute(inVals[ix], outVals[ix]);
                }
            }
        }

        /** Dispatches on the format of {@code _m2} for the case where {@code _m2} is the second operand. */
        private void processRight(int rl, int ru) {
            if (!_m2.isInSparseFormat()) {
                processRightDense(rl, ru);
            } else {
                processRightSparse(rl, ru);
            }
        }

        /** Applies {@code fn(ret, m2)} per cell, reading {@code _m2} from its sparse block. */
        private void processRightSparse(int rl, int ru) {
            final DenseBlock out = _ret.getDenseBlock();
            final int nCol = _ret.getNumColumns();
            final SparseBlock sb = _m2.getSparseBlock();
            for (int row = rl; row < ru; row++) {
                final double[] outVals = out.values(row);
                final int base = out.pos(row);
                if (sb.isEmpty(row)) {
                    // No stored entries in this row of m2: every operand from m2 is 0.
                    final int end = base + nCol;
                    for (int ix = base; ix < end; ix++) {
                        outVals[ix] = _op.fn.execute(outVals[ix], 0.0);
                    }
                } else {
                    final int apos = sb.pos(row);
                    final int aend = apos + sb.size(row);
                    final int[] aix = sb.indexes(row);
                    final double[] avals = sb.values(row);
                    int k = apos;
                    for (int col = 0; col < nCol; col++) {
                        // Consume the next stored entry only when it belongs to this column; otherwise use 0.
                        final double m2Val = (k < aend && aix[k] == col) ? avals[k++] : 0.0;
                        final int ix = base + col;
                        outVals[ix] = _op.fn.execute(outVals[ix], m2Val);
                    }
                }
            }
        }

        /** Applies {@code fn(ret, m2)} per cell, reading {@code _m2} from its dense block at the same index. */
        private void processRightDense(int rl, int ru) {
            final DenseBlock out = _ret.getDenseBlock();
            final int nCol = _ret.getNumColumns();
            final DenseBlock in = _m2.getDenseBlock();
            for (int row = rl; row < ru; row++) {
                final double[] outVals = out.values(row);
                final double[] inVals = in.values(row);
                final int begin = out.pos(row);
                final int end = begin + nCol;
                for (int ix = begin; ix < end; ix++) {
                    outVals[ix] = _op.fn.execute(outVals[ix], inVals[ix]);
                }
            }
        }
    }

    /**
     * Task that applies a binary operator between rows {@code [_rl, _ru)} of a compressed matrix and a column
     * vector, appending the results to the sparse block of {@code _ret}.
     *
     * <p>Rows are decompressed block-wise into a temporary dense block allocated in {@code call()}; the vector
     * {@code _m2} is read through {@code getDenseBlockValues()} and indexed by row. {@code _left} selects whether
     * the vector value is the first or the second operand.
     */
    private static class BinaryMVColTaskSparse implements Callable<Long> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final BinaryOperator _op;
        private MatrixBlock tmp;
        private boolean _left;

        protected BinaryMVColTaskSparse(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, BinaryOperator op, boolean left) {
            _m1 = m1;
            _m2 = m2;
            _ret = ret;
            _op = op;
            _rl = rl;
            _ru = ru;
            _left = left;
        }

        /**
         * Processes the row range in blocks as tall as the temporary block.
         *
         * @return result of {@code _ret.recomputeNonZeros(_rl, _ru - 1)}
         */
        @Override
        public Long call() {
            tmp = CLALibBinaryCellOp.allocateTempUncompressedBlock(_m1.getNumColumns());
            final int blockLength = tmp.getNumRows();
            final List<AColGroup> groups = _m1.getColGroups();
            final AIterator[] its = CLALibBinaryCellOp.getIterators(groups, _rl);
            for (int start = _rl; start < _ru; start += blockLength) {
                final int end = Math.min(start + blockLength, _ru);
                if (_left) {
                    processBlockLeft(start, end, groups, its);
                } else {
                    processBlock(start, end, groups, its);
                }
            }
            return _ret.recomputeNonZeros(_rl, _ru - 1);
        }

        /** Decompresses one block into {@code tmp}, applies {@code fn(cell, vector)}, then resets {@code tmp}. */
        private void processBlock(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            CLALibBinaryCellOp.decompressToTmpBlock(rl, ru, tmp.getDenseBlock(), groups, its);
            processDense(rl, ru);
            tmp.reset();
        }

        /** Decompresses one block into {@code tmp}, applies {@code fn(vector, cell)}, then resets {@code tmp}. */
        private void processBlockLeft(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            CLALibBinaryCellOp.decompressToTmpBlock(rl, ru, tmp.getDenseBlock(), groups, its);
            processDenseLeft(rl, ru);
            tmp.reset();
        }

        /** Appends {@code fn(tmpCell, m2[row])} for every column of rows {@code [rl, ru)}. */
        private void processDense(int rl, int ru) {
            final int nCol = _m1.getNumColumns();
            final SparseBlock out = _ret.getSparseBlock();
            final double[] tmpVals = tmp.getDenseBlockValues();
            final double[] vecVals = _m2.getDenseBlockValues();
            for (int row = rl; row < ru; row++) {
                final double vecVal = vecVals[row];
                // tmp holds the block starting at its row 0, so rows are addressed relative to rl.
                final int rowStart = (row - rl) * nCol;
                for (int col = 0; col < nCol; col++) {
                    final double result = _op.fn.execute(tmpVals[rowStart + col], vecVal);
                    out.append(row, col, result);
                }
            }
        }

        /** Appends {@code fn(m2[row], tmpCell)} for every column of rows {@code [rl, ru)}. */
        private void processDenseLeft(int rl, int ru) {
            final int nCol = _m1.getNumColumns();
            final SparseBlock out = _ret.getSparseBlock();
            final double[] tmpVals = tmp.getDenseBlockValues();
            final double[] vecVals = _m2.getDenseBlockValues();
            for (int row = rl; row < ru; row++) {
                final double vecVal = vecVals[row];
                // tmp holds the block starting at its row 0, so rows are addressed relative to rl.
                final int rowStart = (row - rl) * nCol;
                for (int col = 0; col < nCol; col++) {
                    final double result = _op.fn.execute(vecVal, tmpVals[rowStart + col]);
                    out.append(row, col, result);
                }
            }
        }
    }

    /**
     * Task that applies a binary operator between rows {@code [_rl, _ru)} of a compressed matrix and a column
     * vector, writing into the dense block of {@code _ret}.
     *
     * <p>Each block of rows is decompressed directly into {@code _ret} and then overwritten cell by cell with
     * the operator result. The vector {@code _m2} is read through {@code getDenseBlockValues()} and indexed by
     * row. {@code _left} selects whether the vector value is the first or the second operand.
     */
    private static class BinaryMVColTaskDense implements Callable<Long> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final MatrixBlock _ret;
        private final BinaryOperator _op;
        private boolean _left;

        protected BinaryMVColTaskDense(CompressedMatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, BinaryOperator op, boolean left) {
            _m1 = m1;
            _m2 = m2;
            _ret = ret;
            _op = op;
            _rl = rl;
            _ru = ru;
            _left = left;
        }

        /**
         * Processes the row range in blocks of {@code max(16384 / nCols, 64)} rows.
         *
         * @return result of {@code _ret.recomputeNonZeros(_rl, _ru - 1)}
         */
        @Override
        public Long call() {
            final int blockLength = Math.max(16384 / _ret.getNumColumns(), 64);
            final List<AColGroup> groups = _m1.getColGroups();
            final AIterator[] its = CLALibBinaryCellOp.getIterators(groups, _rl);
            for (int start = _rl; start < _ru; start += blockLength) {
                final int end = Math.min(start + blockLength, _ru);
                if (_left) {
                    processBlockLeft(start, end, groups, its);
                } else {
                    processBlock(start, end, groups, its);
                }
            }
            return _ret.recomputeNonZeros(_rl, _ru - 1);
        }

        /** Decompresses rows {@code [rl, ru)} into the result, then applies {@code fn(cell, vector)}. */
        private void processBlock(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            final DenseBlock target = _ret.getDenseBlock();
            CLALibBinaryCellOp.decompressToSubBlock(rl, ru, target, groups, its);
            processGenericDense(rl, ru);
        }

        /** Decompresses rows {@code [rl, ru)} into the result, then applies {@code fn(vector, cell)}. */
        private void processBlockLeft(int rl, int ru, List<AColGroup> groups, AIterator[] its) {
            final DenseBlock target = _ret.getDenseBlock();
            CLALibBinaryCellOp.decompressToSubBlock(rl, ru, target, groups, its);
            processGenericDenseLeft(rl, ru);
        }

        /** Row loop for the vector-on-the-right case. */
        private void processGenericDense(int rl, int ru) {
            final int nCol = _m1.getNumColumns();
            final DenseBlock out = _ret.getDenseBlock();
            final double[] vecVals = _m2.getDenseBlockValues();
            for (int row = rl; row < ru; row++) {
                final double[] rowVals = out.values(row);
                final int rowPos = out.pos(row);
                final double vecVal = vecVals[row];
                processRow(nCol, rowVals, rowPos, vecVal);
            }
        }

        /** Row loop for the vector-on-the-left case. */
        private void processGenericDenseLeft(int rl, int ru) {
            final int nCol = _m1.getNumColumns();
            final DenseBlock out = _ret.getDenseBlock();
            final double[] vecVals = _m2.getDenseBlockValues();
            for (int row = rl; row < ru; row++) {
                final double[] rowVals = out.values(row);
                final int rowPos = out.pos(row);
                final double vecVal = vecVals[row];
                processRowLeft(nCol, rowVals, rowPos, vecVal);
            }
        }

        /** Overwrites {@code ncol} cells starting at {@code posR} with {@code fn(cell, vr)}. */
        private void processRow(int ncol, double[] ret, int posR, double vr) {
            final int end = posR + ncol;
            for (int ix = posR; ix < end; ix++) {
                ret[ix] = _op.fn.execute(ret[ix], vr);
            }
        }

        /** Overwrites {@code ncol} cells starting at {@code posR} with {@code fn(vr, cell)}. */
        private void processRowLeft(int ncol, double[] ret, int posR, double vr) {
            final int end = posR + ncol;
            for (int ix = posR; ix < end; ix++) {
                ret[ix] = _op.fn.execute(vr, ret[ix]);
            }
        }
    }

    /**
     * Task that evaluates a comparison operator between rows {@code [_rl, _ru)} of a compressed matrix and a
     * column vector, encoding each row's results as one {@code int} in {@code _ret}.
     *
     * <p>For every row the per-column comparison results are shifted into an integer, first column first, so the
     * first column ends up in the highest of the {@code nCol} used bits. {@code _ret} has {@code ru - rl} entries
     * and is indexed relative to {@code _rl}. The operator's {@code fn} is cast to
     * {@code ValueComparisonFunction} in the constructor.
     *
     * <p>NOTE(review): the encoding uses a 32-bit {@code int}; presumably callers only use this task when the
     * column count fits. Confirm at the call site.
     */
    private static class BinaryMVColTaskCompressed implements Callable<Long> {
        private final int _rl;
        private final int _ru;
        private final CompressedMatrixBlock _m1;
        private final MatrixBlock _m2;
        private final int[] _ret;
        private final BinaryOperator _op;
        private final ValueComparisonFunction _compFn;
        private final boolean _left;
        private MatrixBlock tmp;

        protected BinaryMVColTaskCompressed(CompressedMatrixBlock m1, MatrixBlock m2, int rl, int ru, BinaryOperator op, boolean left) {
            _m1 = m1;
            _m2 = m2;
            _op = op;
            _rl = rl;
            _ru = ru;
            _ret = new int[ru - rl];
            _compFn = (ValueComparisonFunction) op.fn;
            _left = left;
        }

        /**
         * Processes the row range in blocks as tall as the temporary block.
         *
         * @return number of comparisons that evaluated to {@code true}
         */
        @Override
        public Long call() {
            tmp = CLALibBinaryCellOp.allocateTempUncompressedBlock(_m1.getNumColumns());
            final int blockLength = tmp.getNumRows();
            final List<AColGroup> groups = _m1.getColGroups();
            final AIterator[] its = CLALibBinaryCellOp.getIterators(groups, _rl);
            long nnz = 0L;
            // outOffset tracks the position in _ret that corresponds to the first row of the current block.
            for (int start = _rl, outOffset = 0; start < _ru; start += blockLength, outOffset += blockLength) {
                final int end = Math.min(start + blockLength, _ru);
                CLALibBinaryCellOp.decompressToTmpBlock(start, end, tmp.getDenseBlock(), groups, its);
                if (_left) {
                    nnz += processDenseLeft(start, end, outOffset);
                } else {
                    nnz += processDense(start, end, outOffset);
                }
                tmp.reset();
            }
            return nnz;
        }

        /**
         * Encodes {@code compare(cell, m2[row])} for rows {@code [rl, ru)} into {@code _ret}, starting at
         * {@code retIxOffset}.
         *
         * @return number of {@code true} comparisons in the block
         */
        private long processDense(int rl, int ru, int retIxOffset) {
            final int nCol = _m1.getNumColumns();
            final double[] tmpVals = tmp.getDenseBlockValues();
            final double[] vecVals = _m2.getDenseBlockValues();
            long nnz = 0L;
            for (int row = rl, outIx = retIxOffset; row < ru; row++, outIx++) {
                final double vecVal = vecVals[row];
                final int rowStart = (row - rl) * nCol;
                int bits = 0;
                for (int col = 0; col < nCol; col++) {
                    bits <<= 1;
                    if (_compFn.compare(tmpVals[rowStart + col], vecVal)) {
                        bits |= 1;
                        nnz++;
                    }
                }
                _ret[outIx] = bits;
            }
            return nnz;
        }

        /**
         * Encodes {@code compare(m2[row], cell)} for rows {@code [rl, ru)} into {@code _ret}, starting at
         * {@code retIxOffset}.
         *
         * @return number of {@code true} comparisons in the block
         */
        private long processDenseLeft(int rl, int ru, int retIxOffset) {
            final int nCol = _m1.getNumColumns();
            final double[] tmpVals = tmp.getDenseBlockValues();
            final double[] vecVals = _m2.getDenseBlockValues();
            long nnz = 0L;
            for (int row = rl, outIx = retIxOffset; row < ru; row++, outIx++) {
                final double vecVal = vecVals[row];
                final int rowStart = (row - rl) * nCol;
                int bits = 0;
                for (int col = 0; col < nCol; col++) {
                    bits <<= 1;
                    if (_compFn.compare(vecVal, tmpVals[rowStart + col])) {
                        bits |= 1;
                        nnz++;
                    }
                }
                _ret[outIx] = bits;
            }
            return nnz;
        }
    }
}

