/*******************************************************************************
* Copyright 2023 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

//@HEADER
// ***************************************************
//
// HPCG: High Performance Conjugate Gradient Benchmark
//
// Contact:
// Michael A. Heroux ( maherou@sandia.gov)
// Jack Dongarra     (dongarra@eecs.utk.edu)
// Piotr Luszczek    (luszczek@eecs.utk.edu)
//
// ***************************************************
//@HEADER

/*!
 @file TestCustomKernel.cpp
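 HPCG routine: functional validation (against oneMKL / plain SYCL references) and performance timing of the custom kernels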
 */




#include <sycl/sycl.hpp>
#include <fstream>
#include <iostream>
#include <vector>
#include "hpcg.hpp"
#include "WriteProblem.hpp"
#include "mytimer.hpp"

#include "TestCustomKernels.hpp"
#include "kernels/axpby_kernel.hpp"

#ifndef HPCG_NO_MPI
#include "ExchangeHalo.hpp"
#include <mpi.h>
#include <cstdlib>
#endif

#include <cmath>  // for convert_to_gflops()
#include <cstdio> // for convert_to_gflops()

#if defined(HPCG_USE_CUSTOM_KERNELS) && defined(HPCG_TEST_CUSTOM_KERNELS)

#include "CustomKernels.hpp"
#include "ComputeDotProduct.hpp"
#include "ComputeSPMV.hpp"
#include "ComputeSYMGS.hpp"

#include "VeryBasicProfiler.hpp"

// Optional profiling hooks.
//   BEGIN_PROFILE(n)           -> optData->profiler->begin(n)
//   END_PROFILE(n)             -> optData->profiler->end(n)
//   END_PROFILE_WAIT(n, event) -> waits on `event`, then optData->profiler->end(n)
// With BASIC_PROFILING defined, the macros expand to the calls above and therefore
// need a pointer named `optData` in scope at the point of use. Each expansion already
// ends in ';', and END_PROFILE_WAIT expands to two statements, so do not use it as the
// unbraced body of an if/else/loop.
// Without BASIC_PROFILING, all three macros expand to nothing.
#ifdef BASIC_PROFILING
#define BEGIN_PROFILE(n) optData->profiler->begin(n);
#define END_PROFILE(n) optData->profiler->end(n);
#define END_PROFILE_WAIT(n, event) event.wait(); optData->profiler->end(n);
#else
#define BEGIN_PROFILE(n)
#define END_PROFILE(n)
#define END_PROFILE_WAIT(n, event)
#endif


// =====================================================================
// ====  Master Switches for Functional and Performance Test Runs  =====
// =====================================================================

// perf warmup/run counts: every performance test first executes WARMUP_RUNS untimed
// iterations, then times TIMED_RUNS iterations and reports wall_time / TIMED_RUNS
#define WARMUP_RUNS 2
#define TIMED_RUNS 20

// toggle functional testing (comment out to skip the functional validation suite)
#define TEST_FUNCTIONAL

// toggle performance testing (and modifier to measure oneMKL or not);
// uncomment RUN_ONEMKL_PERF_IF_AVAILABLE to also time the oneMKL routines
// in the sections that are guarded by it
#define TEST_PERFORMANCE
//#define RUN_ONEMKL_PERF_IF_AVAILABLE

// select which functionality to run functional and/or performance testing on
// (comment out a line to disable the sections guarded by that switch)
#define TEST_AXPBY
#define TEST_DOT
#define TEST_SPGEMV
#define TEST_SPGEMV_DOT
#define TEST_SPTRMVL
#define TEST_SPTRMVU
#define TEST_SPTRSVL
#define TEST_SPTRSVU
#define TEST_SYMGS
#define TEST_SYMGS_MV

// =====================================================================
// =====================================================================



namespace {

    #define TOL  1.0e-9

    namespace sparse = oneapi::mkl::sparse;
    namespace mkl = oneapi::mkl;

    /*!
      Element-wise comparison of two arrays on the device.

      Sets tmp_host[0] to 1 if any entry of ref and test differs by more than TOL
      (or if the difference is not a number), and to 0 otherwise. The call blocks
      until the result has been copied into tmp_host.

      @param[in]  queue         queue on which the reset, the comparison kernel and the copy-back are submitted
      @param[in]  length        number of entries compared
      @param[in]  ref           reference values; dereferenced inside the kernel
      @param[in]  test          values under test; dereferenced inside the kernel
      @param[out] tmp_dev       single-entry flag written inside the kernel
      @param[out] tmp_host      single-entry destination of the flag, written via queue.memcpy
      @param[in]  dependencies  events that must complete before the flag reset (and hence the comparison) starts
    */
    void check_arrays(sycl::queue &queue, local_int_t length, double *ref, double *test, local_int_t *tmp_dev, local_int_t *tmp_host, const std::vector<sycl::event> &dependencies)
    {
        // Clear the flag with a pattern of the flag's own type so the whole
        // local_int_t entry is reset regardless of how local_int_t is defined.
        auto ev_reset = queue.fill(tmp_dev, static_cast<local_int_t>(0), 1, dependencies);

        auto ev_check = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_reset);
            auto kernel = [=](sycl::item<1> item) {
                const size_t row = item.get_id(0);
                const double diff = ref[row] - test[row];
                // Negated in-range test: any comparison with NaN is false, so a NaN
                // difference (NaN input, inf - inf, ...) is reported as a mismatch
                // instead of silently passing.
                if (!(diff <= TOL && diff >= -TOL)) {
                    tmp_dev[0] = 1;  // every failing work-item stores the same value
                }
            };
            cgh.parallel_for<class test_custom_kernel_check_arrays>(sycl::range<1>(length), kernel);
        });

        // Copy the flag back and block until it is available to the caller.
        queue.memcpy(tmp_host, tmp_dev, 1*sizeof(local_int_t), ev_check).wait();
    }



    /*!
      Debug variant of check_arrays: same comparison, but additionally prints
      every mismatching entry whose row index is below 1000 from inside the kernel.

      Sets tmp_host[0] to 1 if any entry of ref and test differs by more than TOL
      (or if the difference is not a number), and to 0 otherwise. The call blocks
      until the result has been copied into tmp_host.

      @param[in]  queue         queue on which the reset, the comparison kernel and the copy-back are submitted
      @param[in]  length        number of entries compared
      @param[in]  ref           reference values; dereferenced inside the kernel
      @param[in]  test          values under test; dereferenced inside the kernel
      @param[out] tmp_dev       single-entry flag written inside the kernel
      @param[out] tmp_host      single-entry destination of the flag, written via queue.memcpy
      @param[in]  dependencies  events that must complete before the flag reset (and hence the comparison) starts
    */
    void check_and_print_arrays(sycl::queue &queue, local_int_t length, double *ref, double *test, local_int_t *tmp_dev, local_int_t *tmp_host, const std::vector<sycl::event> &dependencies)
    {
        // Clear the flag with a pattern of the flag's own type so the whole
        // local_int_t entry is reset regardless of how local_int_t is defined.
        auto ev_reset = queue.fill(tmp_dev, static_cast<local_int_t>(0), 1, dependencies);

        auto ev_check = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_reset);
            auto kernel = [=](sycl::item<1> item) {
                int row = item.get_id(0);
                const double diff = ref[row] - test[row];
                // Negated in-range test: any comparison with NaN is false, so a NaN
                // difference (NaN input, inf - inf, ...) is reported as a mismatch
                // instead of silently passing.
                if (!(diff <= TOL && diff >= -TOL)) {
                    tmp_dev[0] = 1;  // every failing work-item stores the same value
                    // Limit the output to the first 1000 rows to keep the log readable.
                    if (row < 1000)
                        sycl::ext::oneapi::experimental::printf("row %d: diff = %3.7f,  ref = %g,  test = %g\n", row, diff, ref[row], test[row]);
                }
            };
            cgh.parallel_for<class test_custom_kernel_check_and_print_arrays>(sycl::range<1>(length), kernel);
        });

        // Copy the flag back and block until it is available to the caller.
        queue.memcpy(tmp_host, tmp_dev, 1*sizeof(local_int_t), ev_check).wait();
    }


} // anonymous namespace




int TestCustomKernels(SparseMatrix &A, Vector &b, Vector &x, int rank, TestCustomKernelsData &testck_data, sycl::queue &queue)
{

    sycl::event ev_test, ev_ref, ev_update, ev_run;

    double start_time = 0.0, wall_time = 0.0, ave_time = 0.0, gflops = 0.0;

    testck_data.count_fail = 0;

    const local_int_t nRows = A.localNumberOfRows;
    const local_int_t nCols = A.localNumberOfColumns;
    struct optData *optData = (struct optData *)A.optimizationData;

    custom::sparseMatrix *sparseM = (custom::sparseMatrix *)optData->esbM;

    Vector r, w, y, y1, z, Ay, Az;

    InitializeVectorDevice(r, nCols, queue);
    InitializeVectorDevice(w, nCols, queue);
    InitializeVectorDevice(y, nCols, queue);
    InitializeVectorDevice(y1, nCols, queue);
    InitializeVectorDevice(z, nCols, queue);
    InitializeVectorDevice(Ay, nRows, queue);
    InitializeVectorDevice(Az, nRows, queue);

    double *rv = r.values;
    double *wv = w.values;
    double *yv = y.values;
    double *y1v = y1.values;
    double *zv = z.values;
    double *Ayv = Ay.values;
    double *Azv = Az.values;


    int ierr;
    double *fp_dev = (double *)sparse_malloc_device(1 * sizeof(double), queue);
    double *fp_host = (double *)sparse_malloc_host(1 * sizeof(double), queue);
    local_int_t *tmp_dev = (local_int_t *)sparse_malloc_device(1 * sizeof(local_int_t), queue);
    local_int_t *tmp_host = (local_int_t *)sparse_malloc_host(1 * sizeof(local_int_t), queue);
    double *tmp2_dev = (double *)sparse_malloc_device(nRows * sizeof(double), queue);
    if ( fp_dev == NULL || tmp_dev == NULL || tmp_host == NULL || tmp2_dev == NULL){
        std::cerr << "rank " << rank << ": error in TestCustomKernels allocation" << std::endl;
        return 1;
    }


    local_int_t nRows_b = optData->nrow_b;
    local_int_t *bmap = optData->bmap;

    using val_t = double;
    constexpr std::uint64_t seed = 0; // 777;
    oneapi::mkl::rng::philox4x32x10 engine(queue, seed);
    oneapi::mkl::rng::uniform<val_t> distribution(static_cast<val_t>(-1.0), static_cast<val_t>(1.0));

    oneapi::mkl::rng::generate(distribution, engine, nCols, rv, {}).wait();
    oneapi::mkl::rng::generate(distribution, engine, nCols, wv, {}).wait();
//    queue.fill(wv, 0.01, nRows).wait();

    queue.fill(yv, 0.0, nCols).wait();
    queue.fill(y1v, 0.0, nCols).wait();
    queue.fill(zv, 0.0, nCols).wait();
    queue.fill(Ayv, 0.0, nRows).wait();
    queue.fill(Azv, 0.0, nRows).wait();

    queue.fill(tmp_dev, 0, 1).wait();

    // copy over nnz for performance tests
    queue.memcpy(tmp_host, optData->ia + nRows, 1*sizeof(local_int_t)).wait();
    const double  nnz = static_cast<double>(tmp_host[0]);


    // decide whether to create onemkl sparse handles or not
#if defined(TEST_FUNCTIONAL) || ( defined(TEST_PERFORMANCE) && defined(RUN_ONEMKL_PERF_IF_AVAILABLE))

    //
    // setup onemkl objects for comparisons
    //

    sparse::matrix_handle_t hMatrixA;
    sparse::matrix_handle_t hMatrixB;

    sparse::init_matrix_handle(&hMatrixA);
    sparse::init_matrix_handle(&hMatrixB);


    sparse::set_csr_data(queue, hMatrixA, nRows, nRows, mkl::index_base::zero,
                         optData->ia, optData->ja, optData->a, {}).wait();
#ifndef HPCG_NO_MPI
    sparse::set_csr_data(queue, hMatrixB, optData->nrow_b, nCols, mkl::index_base::zero,
                         optData->ib, optData->jb, optData->b).wait();
#endif

    sparse::set_matrix_property(hMatrixA, sparse::property::symmetric);
    sparse::set_matrix_property(hMatrixA, sparse::property::sorted);

    sparse::optimize_gemv(queue, mkl::transpose::nontrans, hMatrixA, {}).wait();
    if (A.geom->size > 1) {
        sparse::optimize_gemv(queue, mkl::transpose::nontrans, hMatrixB, {}).wait();
    }

    // call the optimize steps for everything related to onemkl algorithms being tested
    sparse::optimize_trsv(queue, oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, hMatrixA, {}).wait();
    sparse::optimize_trsv(queue, oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, hMatrixA, {}).wait();
    sparse::optimize_trmv(queue, oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, hMatrixA, {}).wait();
    sparse::optimize_trmv(queue, oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit, hMatrixA, {}).wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif


#endif  // defined(TEST_FUNCTIONAL) || ( defined(TEST_PERFORMANCE) && defined(RUN_ONEMKL_PERF_IF_AVAILABLE))



#ifdef TEST_FUNCTIONAL

    if (rank == 0) {
        std::cout << "Starting Test Custom Kernel Functional Validation Suite:" << std::endl;
    }


#ifdef TEST_AXPBY

    oneapi::mkl::rng::generate(distribution, engine, nCols, yv, {}).wait();
    queue.memcpy(zv, yv, sizeof(double) * nCols).wait();

    double alpha = 1.23, beta = 4.56;
    ev_test = queue.submit([&](sycl::handler &cgh) {

        // AXPBY esimd kernel parameters
        constexpr local_int_t block_size = 16;
        const local_int_t nWG = 8;
        constexpr local_int_t uroll = 4;
        local_int_t nBlocks = ceil_div(nCols, uroll * block_size);

        auto kernel = [=] (sycl::nd_item<1> item) SYCL_ESIMD_KERNEL {
            axpby_body<block_size, uroll>(item, wv, yv, alpha, beta, nCols, nBlocks);
        };
        cgh.parallel_for<class axbpy_esimd>(sycl::nd_range<1>(ceil_div(nBlocks, nWG) * nWG, nWG), kernel);
    });
    
    // reference code for axpby
    ev_ref = queue.submit([&](sycl::handler &cgh) {
        cgh.depends_on(ev_test);
        auto kernel = [=](sycl::item<1> item) {
            const local_int_t i = item.get_id(0);
            zv[i] = alpha * wv[i] +  beta * zv[i];
        };
        cgh.parallel_for<class test_axpby>( sycl::range<1>(nCols), kernel);
    });

    check_arrays(queue, nCols, zv, yv, tmp_dev, tmp_host, {ev_ref});
    
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": AXPBY      \t\t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": AXPBY      \t\t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }    queue.wait();

#endif //TEST_AXPBY
    

#ifdef TEST_DOT
    // test ComputeDotProductLocal  rdotw_test = dot(r,w)
    ev_test = ComputeDotProductLocal(nRows, r, w, fp_dev, queue, {});
    queue.memcpy(fp_host, fp_dev, 1*sizeof(double), ev_test).wait();
    double rdotw_test = fp_host[0];

    // reference code for  rdotw_ref = dot(r,w)
    ev_ref = queue.submit([&](sycl::handler &cgh) {
        cgh.depends_on(ev_test);
        auto sumReducer = sycl::reduction(fp_dev, sycl::plus<>(), sycl::property::reduction::initialize_to_identity());
        auto kernel = [=](sycl::item<1> item, auto &sumDot){
            const local_int_t i = item.get_id(0);
            sumDot += wv[i] * rv[i];
        };
        cgh.parallel_for<class test_dotproduct>( sycl::range<1>(nRows), sumReducer, kernel);
    });
    queue.memcpy(fp_host, fp_dev, 1*sizeof(double), ev_ref).wait();
    double rdotw_ref = fp_host[0];

    if (std::fabs(rdotw_test - rdotw_ref) > TOL) {
        std::cout << "\trank " << rank << ": Dot Product \t\t\tverification failed" << std::endl;
    }
    else {
        std::cout << "\trank " << rank << ": Dot Product \t\t\tverification passed" << std::endl;
    }
    queue.wait();

#endif //TEST_DOT





#ifdef TEST_SPGEMV
    //
    // Test GEMV with A (plus nonlocal B) and the fused dot product via custom::SpGEMV_DOT
    //

    // yv = (A + B) * wv; fp_dev = dot(yv, wv) custom
//    ev_test = custom::SpGEMV(queue, sparseM, wv, yv, {});
    //ev_test = ComputeSPMV(A, w, y, queue, ierr, {});
    // yv <- A*wv, fp_dev <- wv*A*wv
    ev_test = custom::SpGEMV_DOT(queue, sparseM, wv, yv, fp_dev, {ev_test});
    queue.memcpy(fp_host, fp_dev, 1*sizeof(double), ev_test).wait();
    double ydotw = fp_host[0];

    // zv = A * wv; fp_dev = dot(zv, wv) onemkl
    ev_ref = sparse::gemv(queue, oneapi::mkl::transpose::nontrans, 1.0, hMatrixA, wv, 0.0, zv, {ev_test});
    if (A.geom->size > 1) {
        // tmp2_dev = B * wv; onemkl
        ev_ref = sparse::gemv(queue, oneapi::mkl::transpose::nontrans, 1.0, hMatrixB, wv, 0.0, tmp2_dev, {ev_ref});
        // zv += tmp2_dev(bmap)
        ev_ref = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_ref);
            auto kernel = [=](sycl::item<1> item){
                const local_int_t row = item.get_id(0);
                zv[bmap[row]] += tmp2_dev[row];
            };
            cgh.parallel_for<class test_gemvB_update>( sycl::range<1>(nRows_b), kernel);
        });
    }
    ev_ref = ComputeDotProductLocal(nRows, z, w, fp_dev, queue, {ev_ref});
    queue.memcpy(fp_host, fp_dev, 1*sizeof(double), ev_ref).wait();
    double zdotw = fp_host[0];

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SpGEMV A+B \t\t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SpGEMV A+B \t\t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }

    if (std::fabs(ydotw - zdotw) > TOL * nRows) {
        std::cout << "\trank " << rank << ": SpGEMV Dot \t\t\tverification failed" << std::endl;
        //printf("ref = %lf, opt = %lf, diff = %0.14lf\n", zdotw, ydotw, std::fabs(ydotw - zdotw));
    }
    else {
        std::cout << "\trank " << rank << ": SpGEMV Dot \t\t\tverification passed" << std::endl;
    }
    queue.wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SPGEMV


#ifdef TEST_SPTRMVL
    //
    // Test SpTRMV Lower + Diagonal + Nonlocal + Update
    //

    ZeroVector(y, queue, {}).wait();
    ZeroVector(y1, queue, {}).wait();
    ZeroVector(z, queue, {}).wait();

    // yv = yv + (L+B) * wv;  custom, y1v not used
    ev_test = custom::SpTRMV(queue, sparseM, custom::uplo::lower_update, wv, rv, yv, y1v, {});
    // zv = zv + (L+I) * wv;  onemkl
    ev_ref = sparse::trmv(queue, oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::unit,
                          1.0, hMatrixA, wv, 1.0, zv, {ev_test});

    // Subtract off unit diagonals
    ev_ref = queue.submit([&](sycl::handler &cgh) {
        cgh.depends_on(ev_ref);
        auto kernel = [=](sycl::item<1> item){
            const local_int_t row = item.get_id(0);
            zv[row] -= wv[row];
        };
        cgh.parallel_for<class test_trmvL_gemvB_update>( sycl::range<1>(nRows), kernel);
    });

    if (A.geom->size > 1) {
        // tmp2_dev = B * wv
        ev_ref = sparse::gemv(queue, oneapi::mkl::transpose::nontrans, 1.0, hMatrixB, wv, 0.0, tmp2_dev, {ev_ref});
        // zv += tmp2_dev(bmap)
        ev_ref = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_ref);
            auto kernel = [=](sycl::item<1> item){
                const local_int_t row = item.get_id(0);
                zv[bmap[row]] += tmp2_dev[row];
            };
            cgh.parallel_for<class test_trmvL_sub_diag>( sycl::range<1>(nRows_b), kernel);
        });
    }

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SpTRMV L+B+update \t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SpTRMV L+B+update \t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SPTRMVL



#ifdef TEST_SPTRMVU
    //
    // Test SpTRMV Upper + Nonlocal
    //

    // yv = (U + B) * wv;  custom
    ev_test = custom::SpTRMV(queue, sparseM, custom::uplo::upper_nonlocal, wv, rv, yv, y1v, {});
    // zv = (D+U) * wv;  onemkl
    ev_ref = sparse::trmv(queue, oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                          1.0, hMatrixA, wv, 0.0, zv, {ev_test});
    if (A.geom->size > 1) {
        // tmp2_dev = B * wv
        ev_ref = sparse::gemv(queue, oneapi::mkl::transpose::nontrans, 1.0, hMatrixB, wv, 0.0, tmp2_dev, {ev_ref});
        // zv += tmp2_dev(bmap)
        ev_ref = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_ref);
            auto kernel = [=](sycl::item<1> item){
                const local_int_t row = item.get_id(0);
                zv[bmap[row]] += tmp2_dev[row];
            };
            cgh.parallel_for<class test_trmvU_gemvB_update>( sycl::range<1>(nRows_b), kernel);
        });
    }
    // zv = rv - (zv - D * wv)
    ev_update = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(ev_ref);
            double *diags = sparseM->diags;
            auto kernel = [=](sycl::item<1> item){
                const local_int_t row = item.get_id(0);
                zv[row] -= diags[row] * wv[row];
                zv[row] = rv[row] - zv[row];
            };
            cgh.parallel_for<class test_trmvU_update_diag>(
                    sycl::range<1>(nRows), kernel);
        });

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_update});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_update});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SpTRMV r-(U+B) \t\t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SpTRMV r-(U+B) \t\t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

    //
    // Test SpTRMV Upper
    //
    if (A.geom->size > 1) { // Convert zv back to Ux
        // remove r component
        auto evt = queue.submit([&](sycl::handler &cgh) {
            auto kernel = [=](sycl::item<1> item){
                const local_int_t row = item.get_id(0);
                zv[row] = rv[row] - zv[row];
            };
            cgh.parallel_for<class test_remove_r>(
                    sycl::range<1>(nRows), kernel);
        });

        // remove Bx component
        ev_ref = queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(evt);
            auto kernel = [=](sycl::item<1> item){
                const local_int_t row = item.get_id(0);
                zv[bmap[row]] -= tmp2_dev[row];
            };
            cgh.parallel_for<class test_trmvU_update>( sycl::range<1>(nRows_b), kernel);
        });

        check_arrays(queue, nRows, zv, y1v, tmp_dev, tmp_host, {ev_ref});
        if (tmp_host[0] == 1) {
            std::cout << "\trank " << rank << ": SpTRMV U \t\t\tverification failed" << std::endl;
            testck_data.count_fail++;
        }
        else {
            std::cout << "\trank " << rank << ": SpTRMV U \t\t\tverification passed" << std::endl;
            testck_data.count_pass++;
        }
        queue.wait();
    }

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SPTRMVU


#ifdef TEST_SPTRSVL

    //
    // Test TRSV Lower
    //

    // solve (L+D) * yv = wv;  custom
    ev_test = custom::SpTRSV(queue, sparseM, custom::uplo::lower_diagonal, wv, yv, {});

    // solve (L+D) * zv = wv;  onemkl
    ev_ref = sparse::trsv(queue, oneapi::mkl::uplo::lower,  oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                          hMatrixA, wv, zv, {ev_test});

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SpTRSV FWD \t\t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SpTRSV FWD \t\t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SPTRSVL


#ifdef TEST_SPTRSVU



    //
    // Test TRSV Upper
    //
    // solve (D+U) * yv = wv;  custom
    ev_test = custom::SpTRSV(queue, sparseM, custom::uplo::upper_diagonal, wv, yv, {});

    // solve (D+U) * zv = wv;  onemkl
    ev_ref = sparse::trsv(queue, oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                          hMatrixA, wv, zv, {ev_test});

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SpTRSV BWD \t\t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SpTRSV BWD \t\t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SPTRSVU


#ifdef TEST_SYMGS


    //
    // Test SYMGS
    //

    // SYMGS input scaling
    val_t max_scale = 0.01;
    oneapi::mkl::rng::uniform<val_t> distribution2(-max_scale, max_scale);
    oneapi::mkl::rng::generate(distribution2, engine, nRows, yv, {}).wait();
    queue.memcpy(zv, yv, nRows*sizeof(double)).wait();

    ev_test = run_SYMGS_custom(queue, A, optData, sparseM, w, y, {});
    ev_ref = run_SYMGS_onemkl(queue, A, optData, hMatrixA, hMatrixB, w, z, {ev_test});

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SYMGS \t\t\t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SYMGS \t\t\t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SYMGS


#ifdef TEST_SYMGS_MV

    // reset y,z to zero
    //ZeroVector(y, queue, {}).wait();
    //ZeroVector(z, queue, {}).wait();
    oneapi::mkl::rng::generate(distribution2, engine, nRows, yv, {}).wait();
    queue.memcpy(zv, yv, nRows*sizeof(double)).wait();

    //
    // Test SYMGS_MV full permutation
    //
    ev_test = run_SYMGS_MV_custom(queue, A, optData, sparseM, w, y, Ay, {});
    ev_ref = run_SYMGS_MV_onemkl(queue, A, optData, hMatrixA, hMatrixB, w, z, Az, {ev_test});

//    check_and_print_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, zv, yv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SYMGS_MV y == z \t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SYMGS_MV y == z \t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

//    check_and_print_arrays(queue, nRows, Azv, Ayv, tmp_dev, tmp_host, {ev_ref});
    check_arrays(queue, nRows, Azv, Ayv, tmp_dev, tmp_host, {ev_ref});
    if (tmp_host[0] == 1) {
        std::cout << "\trank " << rank << ": SYMGS_MV Ay == Az \t\tverification failed" << std::endl;
        testck_data.count_fail++;
    }
    else {
        std::cout << "\trank " << rank << ": SYMGS_MV Ay == Az \t\tverification passed" << std::endl;
        testck_data.count_pass++;
    }
    queue.wait();

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SYMGS_MV


#endif // TEST_FUNCTIONAL


#ifdef TEST_PERFORMANCE

    // ============================================================================================================
    //  Custom Kernel Performance suites
    // ============================================================================================================


    double n_flops = 0;
    double n_kflops = 0;
    double n_mflops = 0;
    double GFLOPS = 1e-9;
    double MFLOPS = 1e-6;
    double KFLOPS = 1e-3;

    auto convert_to_gflops = [=](double n_flops, double ave_time_in_seconds) {
        double ticks = n_flops / ave_time_in_seconds;
//        return ticks * 1e-9;
        return fma(0x1.12e0be826d695p-30, ticks, fma(-0x1.34674bfabb83bp-84, ticks, 0)); // divide more accurately by 1e9
    };

    if (rank == 0) {
        std::cout << "Starting Performance Suites: reporting ave_time (sec) for " << TIMED_RUNS << " runs after " << WARMUP_RUNS << " warmup runs" << std::endl;
    }

    //
    // ----------------------------   DOT performance ---------------------------------------------
    //

#ifdef TEST_DOT
    //
    // Test Dot custom
    //

    // n_flops for Dot (2 * nRows - 1 flops)
    n_flops = 2 * nRows - 1;

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = ComputeDotProductLocal(nRows, r, w, fp_dev, queue, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = ComputeDotProductLocal(nRows, r, w, fp_dev, queue, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;
//    gflops =  n_flops * GFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::Dot:        \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif
#endif // TEST_DOT

    //
    // ----------------------------   SpGEMV performance ---------------------------------------------
    //

#ifdef TEST_SPGEMV

    //
    // Test SpGEMV on A + B custom
    //

    // n_flops for SpGEMV
    n_flops = 2 * nnz;

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = custom::SpGEMV(queue, sparseM, wv, yv, {ev_run});
        //ev_run = ComputeSPMV(A, w, y, queue, ierr, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = custom::SpGEMV(queue, sparseM, wv, yv, {ev_run});
        //ev_run = ComputeSPMV(A, w, y, queue, ierr, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;
//    gflops =  n_flops * GFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::SpGEMV A+B: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#ifdef RUN_ONEMKL_PERF_IF_AVAILABLE
    //
    // Test oneapi::mkl::sparse::gemv() on A (oneMKL timing for comparison)
    //
    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = sparse::gemv(queue, oneapi::mkl::transpose::nontrans, 1.0,
                               hMatrixA, wv, 0.0, zv, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = sparse::gemv(queue, oneapi::mkl::transpose::nontrans, 1.0,
                               hMatrixA, wv, 0.0, zv, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;
    //gflops = static_cast<double>(n_kflops) * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": sparse::gemv A: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // RUN_ONEMKL_PERF_IF_AVAILABLE
#endif // TEST_SPGEMV

    //
    // ----------------------------   SpGEMV+DOT performance ---------------------------------------------
    //

#ifdef TEST_SPGEMV_DOT

    //
    // Test SpGEMV Dot custom
    //

    // n_flops for SpGEMV (2 * nnz flops) + Dot (2 * nRows - 1 flops)
    n_flops = 2 * nnz + 2 * nRows - 1;

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = ComputeSPMV_DOT(A, w, y, fp_dev, queue, ierr, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = ComputeSPMV_DOT(A, w, y, fp_dev, queue, ierr, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;
//    gflops =  n_flops * GFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::SpGEMV Dot: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif
#endif // TEST_SPGEMV_DOT

    //
    // ----------------------------   SpTRMV performance ---------------------------------------------
    //


#ifdef TEST_SPTRMVL
    //
    // Test custom::SpTRMV() lower_update
    //

    // n_flops for trmv lower diagonal
    //n_kflops = 2 * ((nnz - nRows) / 2 + nRows) / 1e3;
    n_flops = 2 * ((nnz - nRows) / 2);

    ZeroVector(y, queue, {}).wait();

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = custom::SpTRMV(queue, sparseM, custom::uplo::lower_update, wv, rv, yv, y1v, {ev_run});
    }
    ev_run.wait();

    ZeroVector(y, queue, {}).wait();

    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = custom::SpTRMV(queue, sparseM, custom::uplo::lower_update, wv, rv, yv, y1v, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

    //gflops = static_cast<double>(n_kflops) * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::SpTRMV L+B update: \t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // TEST_SPTRMVL



#ifdef TEST_SPTRMVU
    //
    // Test custom::SpTRMV() upper_nonlocal
    //

    // n_flops for trmv upper no diagonal
    //n_kflops = (2 * ((nnz - nRows) / 2 ) + nRows) / 1e3;
    n_flops = (2 * ((nnz - nRows) / 2 ) + nRows);


    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = custom::SpTRMV(queue, sparseM, custom::uplo::upper_nonlocal, wv, rv, yv, y1v, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = custom::SpTRMV(queue, sparseM, custom::uplo::upper_nonlocal, wv, rv, yv, y1v, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

    //gflops = static_cast<double>(n_kflops) * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::SpTRMV r-(U+B):\t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif


#endif // TEST_SPTRMVU



    //
    // ----------------------------   SpTRSV performance ---------------------------------------------
    //

    // n_flops for trsv
    //n_kflops = 2 * ((nnz - nRows) / 2 + nRows) / 1e3;
    n_flops = 2.0 * ((nnz - nRows) / 2.0 + nRows);


#ifdef TEST_SPTRSVL
    //
    // Test SpTRSV Lower_diagonal on A
    //
    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = custom::SpTRSV(queue, sparseM, custom::uplo::lower_diagonal, wv, yv, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = custom::SpTRSV(queue, sparseM, custom::uplo::lower_diagonal, wv, yv, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

    //gflops = n_kflops * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::SpTRSV FWD: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif



#ifdef RUN_ONEMKL_PERF_IF_AVAILABLE
    //
    // Test oneapi::mkl::sparse::trsv() lower
    //

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = sparse::trsv(queue, oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                              hMatrixA, wv, zv, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = sparse::trsv(queue, oneapi::mkl::uplo::lower, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                              hMatrixA, wv, zv, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

    gflops = n_kflops * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": sparse::trsv FWD: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // RUN_ONEMKL_PERF_IF_AVAILABLE
#endif // TEST_SPTRSVL



#ifdef TEST_SPTRSVU
    //
    // Test SpTRSV Upper_diagonal
    //
    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = custom::SpTRSV(queue, sparseM, custom::uplo::upper_diagonal, wv, yv, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = custom::SpTRSV(queue, sparseM, custom::uplo::upper_diagonal, wv, yv, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

    gflops = n_kflops * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom::SpTRSV BWD: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif


#ifdef RUN_ONEMKL_PERF_IF_AVAILABLE
    //
    // Test oneapi::mkl::sparse::trsv() upper
    //

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = sparse::trsv(queue, oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                              hMatrixA, wv, zv, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = sparse::trsv(queue, oneapi::mkl::uplo::upper, oneapi::mkl::transpose::nontrans, oneapi::mkl::diag::nonunit,
                              hMatrixA, wv, zv, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

    gflops = n_kflops * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": sparse::trsv BWD: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // RUN_ONEMKL_PERF_IF_AVAILABLE
#endif // TEST_SPTRSVU

    //
    // ----------------------------   SYMGS performance ---------------------------------------------
    //


    // n_flops for SYMGS
    // optimized n_flops
    // n_flops = (3 * nnz + 5 * nRows);
    // reference n_flops
    n_flops = 4 * nnz + 2 * nRows;

#ifdef TEST_SYMGS
    //
    // Test SYMGS with custom kernels
    //
    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = run_SYMGS_custom(queue, A, optData, sparseM, w, y, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = run_SYMGS_custom(queue, A, optData, sparseM, w, y, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

//    gflops = n_kflops * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom SYMGS: \t\t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif


#ifdef RUN_ONEMKL_PERF_IF_AVAILABLE
    //
    // Test SYMGS with oneMKL kernels
    //

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = run_SYMGS_onemkl(queue, A, optData, hMatrixA, hMatrixB, w, z, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = run_SYMGS_onemkl(queue, A, optData, hMatrixA, hMatrixB, w, z, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;

//    gflops = n_kflops * MFLOPS  / ave_time;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": oneMKL SYMGS: \t\t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // RUN_ONEMKL_PERF_IF_AVAILABLE
#endif // TEST_SYMGS

    //
    // ----------------------------   SYMGS_MV performance ---------------------------------------------
    //

    // n_flops for SYMGS_MV
    // optimized n_flops
//    n_flops = (3 * nnz + 6 * nRows);
    // reference n_flops
    n_flops = 6 * nnz + 2 * nRows;

#ifdef TEST_SYMGS_MV
    //
    // Test SYMGS_MV with PTAP using custom kernels
    //
    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = run_SYMGS_MV_custom(queue, A, optData, sparseM, w, y, Ay, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = run_SYMGS_MV_custom(queue, A, optData, sparseM, w, y, Ay, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": custom SYMGS_MV: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#ifdef RUN_ONEMKL_PERF_IF_AVAILABLE
    //
    // Test SYMGS_MV with oneMKL kernels
    //

    ev_run = sycl::event();
    for (int run = 0; run < WARMUP_RUNS; ++run) {
        ev_run = run_SYMGS_MV_onemkl(queue, A, optData, hMatrixA, hMatrixB, w, z, Az, {ev_run});
    }
    ev_run.wait();


    start_time = mytimer();
    for (int run = 0; run < TIMED_RUNS; ++run) {
        ev_run = run_SYMGS_MV_onemkl(queue, A, optData, hMatrixA, hMatrixB, w, z, Az, {ev_run});
    }
    ev_run.wait();
    wall_time = mytimer() - start_time;
    ave_time = wall_time / TIMED_RUNS;
    gflops = convert_to_gflops(n_flops, ave_time);
    std::cout << "\trank " << rank << ": oneMKL SYMGS_MV: \t\t" << ave_time << ", GFLOPS = " << gflops << std::endl;

#ifndef HPCG_NO_MPI
    MPI_Barrier(MPI_COMM_WORLD);
#endif

#endif // RUN_ONEMKL_PERF_IF_AVAILABLE
#endif // TEST_SYMGS_MV

#endif // TEST_PERFORMANCE

    // cleanup
    queue.wait();
    DeleteVector(w, queue);
    DeleteVector(y, queue);
    DeleteVector(z, queue);
    DeleteVector(Ay, queue);
    DeleteVector(Az, queue);

    sycl::free(fp_dev, queue);
    sycl::free(fp_host, queue);
    sycl::free(tmp_dev, queue);
    sycl::free(tmp_host, queue);
    sycl::free(tmp2_dev, queue);

#if defined(TEST_FUNCTIONAL) || ( defined(TEST_PERFORMANCE) && defined(RUN_ONEMKL_PERF_IF_AVAILABLE))
    sparse::release_matrix_handle(queue, &hMatrixA, {}).wait();
    sparse::release_matrix_handle(queue, &hMatrixB, {}).wait();
#endif  //defined(TEST_FUNCTIONAL) || ( defined(TEST_PERFORMANCE) && defined(RUN_ONEMKL_PERF_IF_AVAILABLE))


    return 0;
}

#else

int TestCustomKernels(SparseMatrix &A, Vector &b, Vector &x, int rank, TestCustomKernelsData &testck_data, sycl::queue &queue)
{
    return 0;
}

#endif // HPCG_USE_CUSTOM_KERNELS HPCG_TEST_CUSTOM_KERNELS
