// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_b_preshuffle.hpp"

#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

using F8   = f8_t;
using BF16 = bhalf_t;
using F32  = float;

using Row = tensor_layout::gemm::RowMajor;
using Col = tensor_layout::gemm::ColumnMajor;

template <index_t... Is>
using S = Sequence<Is...>;

using PassThrough = element_wise::PassThrough;

static constexpr auto GemmDefault    = GemmSpecialization::Default;
static constexpr auto GemmKPadding   = GemmSpecialization::KPadding;
static constexpr auto GemmMNPadding  = GemmSpecialization::MNPadding;
static constexpr auto GemmNKPadding  = GemmSpecialization::NKPadding;
static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;

static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;

// Instance list for the 32x32 XDL (non-compute) variant. It is deliberately an empty tuple:
// no valid instances are available for this configuration, but the alias is kept so that
// code naming it continues to compile.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_instances = std::tuple<>;
// Tuple of DeviceGemm_Xdl_CShuffleV3_BPreshuffle instantiations built on a 32x32 XDL tile.
// GemmSpec is forwarded unchanged to every entry. All entries pass the layouts Row, Col, Row
// and the types F8, F8, BF16, F32, BF16 (A/B inputs, output, accumulator and C-shuffle type,
// going by the alias name and the column header).
//
// Reading the numeric arguments against the column header, the ten entries share a 256-thread
// block, KPerBlock = 128, AK1 = BK1 = 16 and identical transfer/C-shuffle settings; they differ
// only in MPerBlock (256/224/192/160/128), NPerBlock (256 or 128) and the matching
// MXdl/NXdl-per-wave counts.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma32x32_mn_compute_instances =
    std::tuple<
        // clang-format off
    //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
    //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
    //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
    //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
    // NOTE(review): the header above lists ten layout/type columns before the element-wise
    // operations, but each entry supplies eight arguments there (three layouts, five types);
    // the DsLayout/DsData columns look like leftovers from a multi-D table - confirm against
    // the template declaration. The trailing F8 argument has no header column either.
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     256,    256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    256,   128,  16,  16,  32,   32,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    256,   128,  16,  16,  32,   32,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     160,    256,   128,  16,  16,  32,   32,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     256,    128,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    128,   128,  16,  16,  32,   32,    7,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    128,   128,  16,  16,  32,   32,    6,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     160,    128,   128,  16,  16,  32,   32,    5,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8,  BF16,  F32,    BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
        // clang-format on
        >;
// Part 1 of the 16x16 XDL instance lists: tuple of DeviceGemm_Xdl_CShuffleV3_BPreshuffle
// instantiations with GemmSpec forwarded to every entry.
//
// Reading the numeric arguments against the column header, the three enabled entries share a
// 256-thread block, MPerBlock = 64, KPerBlock = 128 and MXdl-per-wave = 4; they differ in
// NPerBlock (128/256/512) and NXdl-per-wave (2/4/8).
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p1_instances = std::tuple<
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        // NOTE(review): the header above lists ten layout/type columns before the element-wise
        // operations, but each entry supplies eight arguments there; the DsLayout/DsData columns
        // look like leftovers from a multi-D table - confirm against the template declaration.
        //
        // The commented-out entries below are invalid because MRepeat < 4: each of them has 2
        // in the MXdl-per-wave column, whereas every enabled entry has 4.
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<           Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    64,    128,   128,  16,  16,          16,   16,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    ////DeviceGemm_Xdl_CShuffleV3_BPreshuffle<           Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    64,    128,   128,  16,  16,          16,   16,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          1,           2,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    ////DeviceGemm_Xdl_CShuffleV3_BPreshuffle<           Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    32,    128,   128,  16,  16,          16,   16,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<           Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    64,    256,   128,  16,  16,          16,   16,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    ////DeviceGemm_Xdl_CShuffleV3_BPreshuffle<           Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    32,    256,   128,  16,  16,          16,   16,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<           Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    64,    512,   128,  16,  16,          16,   16,    4,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    /////DeviceGemm_Xdl_CShuffleV3_BPreshuffle<          Row,     Col,       Row,    F8,    F8, BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,    32,    512,   128,  16,  16,          16,   16,    2,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,              0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    // clang-format on
    >;

// Part 2 of the 16x16 XDL instance lists: tuple of DeviceGemm_Xdl_CShuffleV3_BPreshuffle
// instantiations with GemmSpec forwarded to every entry.
//
// Reading the numeric arguments against the column header, the two enabled entries share a
// 256-thread block, MPerBlock = 64, NPerBlock = 128 and MXdl/NXdl-per-wave = 4/2; they differ
// in KPerBlock (256 vs 512) and the A/B thread-cluster lengths (S<16, 16, 1> vs S<32, 8, 1>).
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p2_instances = std::tuple<
    // Commented-out instances do not work because MRepeat < 4: each of them has 1 or 2 in the
    // MXdl-per-wave column, whereas every enabled entry has 4.
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        // NOTE(review): the header above lists ten layout/type columns before the element-wise
        // operations, but each entry supplies eight arguments there; the DsLayout/DsData columns
        // look like leftovers from a multi-D table - confirm against the template declaration.
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,    64,    128,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,    32,    128,   256,  16,  16,  16,   16,    2,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,    32,    128,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,    64,    128,   512,  16,  16,  16,   16,    4,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,    32,    128,   512,  16,  16,  16,   16,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    // clang-format on
    >;

// Part 3 of the 16x16 XDL instance lists: tuple of DeviceGemm_Xdl_CShuffleV3_BPreshuffle
// instantiations with GemmSpec forwarded to every entry.
//
// Reading the numeric arguments against the column header, the two enabled entries share a
// 256-thread block, MPerBlock = 64 and MXdl-per-wave = 4: one is NPerBlock = 256 with
// KPerBlock = 512, the other NPerBlock = 512 with KPerBlock = 256.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p3_instances = std::tuple<
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        // NOTE(review): the header above lists ten layout/type columns before the element-wise
        // operations, but each entry supplies eight arguments there; the DsLayout/DsData columns
        // look like leftovers from a multi-D table - confirm against the template declaration.
        //
        // NPerBlock = 256 and 512 tiles. The commented-out entries are disabled.
        // NOTE(review): all but the first disabled entry have 2 in the MXdl-per-wave column; the
        // first one (64 x 256 x 256) has 4, so it was disabled for a reason not recorded here -
        // confirm before re-enabling.
//      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,       BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                           256,    64,    256,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
//      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,       BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                         256,    32,    256,   256,  16,  16,  16,   16,    2,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,       BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                           256,    64,    256,   512,  16,  16,  16,   16,    4,    4,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
//      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,       BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                         256,    32,    256,   512,  16,  16,  16,   16,    2,    4,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,       BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                           256,    64,    512,   256,  16,  16,  16,   16,    4,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
//      //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,       Row,    F8,    F8,       BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                         256,    32,    512,   256,  16,  16,  16,   16,    2,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    // clang-format on
    >;

// "p4" instance list. Every candidate instantiation below is commented out, so for
// any GemmSpec this alias currently names the empty tuple std::tuple<>.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p4_instances = std::tuple<
    // None of the instances have MRepeat >= 4
    // NOTE(review): every disabled row below has 1 or 2 in the MXdlPerWave column;
    // presumably that is the "MRepeat" meant above and the reason the rows are
    // disabled -- confirm before re-enabling any of them.
    // NOTE(review): the banner lists ten leading columns (incl. DsLayout / DsData) but
    // each row supplies only three layouts and five data types before the element-wise
    // operations, so the leading banner columns do not line up one-to-one with the rows.
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               16,                         64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               16,                        128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               16,                        256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  128,               32,                         16,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 4>,     4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  128,               16,                         32,   128,  16,  16,  16,   16,    1,    1,     S<8, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  128,               16,                         32,   512,  16,  16,  16,   16,    1,    1,     S<32, 4, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,     4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        // NOTE(review): the next row repeats the first disabled row above argument for argument.
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               16,                         64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               32,                         64,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               64,                         64,   512,  16,  16,  16,   16,    2,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        // NOTE(review): the two rows below are identical to each other.
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               64,                         16,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                  256,               64,                         16,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    // clang-format on
    >;

// "p5" instance list. All four candidate instantiations below are commented out, so
// for any GemmSpec this alias currently names the empty tuple std::tuple<>.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_p5_instances = std::tuple<
    // MRepeat < 1, invalid instances
    // NOTE(review): the disabled rows all use MPerBlock = 16 with MPerXDL = 16 and
    // MXdlPerWave = 1; how "MRepeat" is derived from these is not visible in this
    // file -- confirm against the kernel's validity checks before re-enabling.
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<          Row,     Col,   Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,                                         GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,        16,   16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,         2,             16,             16,          0,          1,           1,          S<1, 16, 1, 16>,    4, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<          Row,     Col,   Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,                                         GemmSpec,   256,     16,   128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,        16,   16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,         2,             16,             16,          0,          1,           2,          S<1, 16, 1, 16>,    8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<          Row,     Col,   Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,                                         GemmSpec,   256,     16,   256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,        16,   16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,         2,             16,             16,          0,          1,           2,          S<1, 16, 1, 16>,    8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        //DeviceGemm_Xdl_CShuffleV3_BPreshuffle<          Row,     Col,   Row,    F8,    F8,     BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,                                         GemmSpec,   256,     16,   512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,        16,   16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,         2,             16,             16,          0,          1,           2,          S<1, 16, 1, 16>,    8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>

    // clang-format on
    >;

template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_compute_instances_p1 = std::tuple<
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                     Verision|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     256,    256,   128,  16,  16,  16,   16,    8,    8,                                           S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    256,   128,  16,  16,  16,   16,    7,    8,                                           S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    256,   128,  16,  16,  16,   16,    6,    8,                                           S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     160,    256,   128,  16,  16,  16,   16,    5,    8,                                           S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,                                           S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    // clang-format on
    >;

template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma_mn_compute_instances_p2 = std::tuple<
    // clang-format off
        //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                     Verision|
        //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,     256,    128,   128,  16,  16,  16,   16,    8,    4,   S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,     224,    128,   128,  16,  16,  16,   16,    7,    4,   S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,     192,    128,   128,  16,  16,  16,   16,    6,    4,   S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,     160,    128,   128,  16,  16,  16,   16,    5,    4,   S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,     160,    128,   128,  16,  16,  16,   16,    10,   2,   S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,      Row,    F8,    F8,    BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,                                             256,     128,    128,   128,  16,  16,  16,   16,    4,    4,   S<8, 32, 1>,    S<1, 0, 2>,       S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>, 8  ,BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
    // clang-format on
    >;
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_instances_part1 =
    std::tuple<
        // clang-format off
    //##########################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
    //##########################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
    //##########################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                     Verision|
    //##########################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle< Row, Col, Row, F8, F8,           BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec,       256,              256,     256,    128, 16, 16, 16, 16, 8, 8,              S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                    2, 16, 16, 0,                       S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                       2, 16, 16, 0, 1, 2,                                            S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle< Row, Col, Row, F8, F8,           BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec,       256,              224,     256,    128, 16, 16, 16, 16, 7, 8,              S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                    2, 16, 16, 0,                       S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                       2, 16, 16, 0, 1, 2,                                            S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle< Row, Col, Row, F8, F8,           BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec,       256,              192,     256,    128, 16, 16, 16, 16, 6, 8,              S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                    2, 16, 16, 0,                       S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                       2, 16, 16, 0, 1, 2,                                            S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle< Row, Col, Row, F8, F8,           BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec,       256,              160,     256,    128, 16, 16, 16, 16, 5, 8,              S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                    2, 16, 16, 0,                       S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                       2, 16, 16, 0, 1, 2,                                            S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle< Row, Col, Row, F8, F8,           BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmSpec,       256,              128,     256,    128, 16, 16, 16, 16, 4, 8,              S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                    2, 16, 16, 0,                       S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,                       2, 16, 16, 0, 1, 2,                                            S<1, 32, 1, 8>, 8, BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
        // clang-format on
        >;

// `comp` instance list, part 2: seven DeviceGemm_Xdl_CShuffleV3_BPreshuffle configurations that
// all use MPerBlock = 224 and KPerBlock = 128, with NPerBlock stepping 256 -> 64 in units of 32.
// Every row uses BlockSize 256, 16x16 XDL tiles, the Intrawave scheduler and pipeline v3; GemmSpec
// is forwarded from the alias template parameter.
// Positional arguments follow the column banner printed above the sibling tuples in this file
// (..., GemmSpec, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL,
// MXdlPerWave, NXdlPerWave, A/B block-transfer settings, CShuffle settings, scheduler, pipeline).
// Rows with MXdlPerWave = 14 shuffle 2 M-XDL tiles per pass with an output vector width of 8;
// rows with MXdlPerWave = 7 shuffle 1 per pass with an output vector width of 4.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_instances_part2 =
    std::tuple<
        // clang-format off
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    256,   128,  16,  16,  16,   16,   14,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    192,   128,  16,  16,  16,   16,   14,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,    128,   128,  16,  16,  16,   16,   14,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,     96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,  Row,    F8,    F8, BF16,  F32,     BF16,    PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,   14,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
        // clang-format on
        >;
// `comp` instance list, part 3: seven DeviceGemm_Xdl_CShuffleV3_BPreshuffle configurations that
// all use MPerBlock = 192 and KPerBlock = 128, with NPerBlock stepping 256 -> 64 in units of 32.
// Every row uses BlockSize 256, 16x16 XDL tiles, the Intrawave scheduler and pipeline v3; GemmSpec
// is forwarded from the alias template parameter.
// Rows with MXdlPerWave = 12 use the S<1, 32, 1, 8> CShuffle cluster; rows with MXdlPerWave = 6 use
// S<1, 64, 1, 4>.
// NOTE(review): the banner below lists DsLayout/ELayout and DsData/EData columns, but each row
// passes three layouts and five data types before the elementwise ops, so the leading banner
// columns do not map one-to-one onto the arguments -- confirm against the template declaration.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_instances_part3 =
    std::tuple<
        // clang-format off
        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    256,   128,  16,  16,  16,   16,   12,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    224,   128,  16,  16,  16,   16,    6,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    192,   128,  16,  16,  16,   16,   12,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    160,   128,  16,  16,  16,   16,    6,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,    128,   128,  16,  16,  16,   16,   12,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,     96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,   Row,    F8,    F8,   BF16,  F32,     BF16,     PassThrough, PassThrough, PassThrough,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,   12,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,    8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
        // clang-format on
        >;

// `comp` instance list, part 4: four DeviceGemm_Xdl_CShuffleV3_BPreshuffle configurations that all
// use MPerBlock = 160 (MXdlPerWave = 10) and KPerBlock = 128, with NPerBlock in {256, 192, 128, 64}.
// Every row uses BlockSize 256, 16x16 XDL tiles, the Intrawave scheduler and pipeline v3; GemmSpec
// is forwarded from the alias template parameter.
// NOTE(review): the banner below lists DsLayout/ELayout and DsData/EData columns, but each row
// passes three layouts and five data types before the elementwise ops, so the leading banner
// columns do not map one-to-one onto the arguments -- confirm against the template declaration.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_instances_part4 = std::tuple<
        // clang-format off
        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,                                        GemmSpec,   256,   160,   256,  128,  16,  16,  16,   16,    10,    4,           S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,               16,                 16,          0,    S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,                                        GemmSpec,   256,   160,   192,  128,  16,  16,  16,   16,    10,    3,           S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,               16,                 16,          0,    S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,                                        GemmSpec,   256,   160,   128,  128,  16,  16,  16,   16,    10,    2,           S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,               16,                 16,          0,    S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
        DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,                                        GemmSpec,   256,   160,    64,  128,  16,  16,  16,   16,    10,    1,           S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,               16,                 16,          0,    S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
        // clang-format on
        >;

// `comp` instance list, part 5: five DeviceGemm_Xdl_CShuffleV3_BPreshuffle configurations that all
// use MPerBlock = 128.
//   - The first two rows use KPerBlock = 128 with S<8, 32, 1> A/B thread clusters (NPerBlock 96, 64).
//   - The last three rows use KPerBlock = 256 with S<16, 16, 1> A/B thread clusters
//     (NPerBlock 128, 96, 64).
// Every row uses BlockSize 256, 16x16 XDL tiles, the Intrawave scheduler and pipeline v3; GemmSpec
// is forwarded from the alias template parameter.
// NOTE(review): the banner below lists DsLayout/ELayout and DsData/EData columns, but each row
// passes three layouts and five data types before the elementwise ops, so the leading banner
// columns do not map one-to-one onto the arguments -- confirm against the template declaration.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_f8_f8_bf16_mk_mfma16x16_nk_mn_comp_instances_part5 =
    std::tuple<
        // clang-format off
    //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
    //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|                _MBlock_MPerBlock| ScalarPerVector|                               Pipeline|                     Pipeline|
    //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                _NBlock_NPerBlock|   _NWaveNPerXdl|                              Scheduler|                      Version|
    //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   96,   128,  16,  16,  16,   16,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   64,   128,  16,  16,  16,   16,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,  256,  16,  16,  16,   16,    8,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,            16,          0,          2,            1,                   S<1, 64, 1, 4>,               8,   BlockGemmPipelineScheduler::Intrawave,   BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   64,   256,  16,  16,  16,   16,    8,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
        // clang-format on
        >;

// Instance list, part 6: five DeviceGemm_Xdl_CShuffleV3_BPreshuffle configurations that all use
// MPerBlock = 128 and KPerBlock = 128, with NPerBlock stepping 256 -> 128 in units of 32.
// Every row uses BlockSize 256, 16x16 XDL tiles, the Intrawave scheduler and pipeline v3; GemmSpec
// is forwarded from the alias template parameter.
// The rows are compiled only when __gfx94__, CK_USE_GFX94 or CK_USE_FP8_ON_UNSUPPORTED_ARCH is
// defined; when none of them is defined this alias names an empty std::tuple<>.
// NOTE(review): this alias is spelled `..._preshuffle_xdl_..._mn_compute_instances_part6`, whereas
// parts 2-5 are spelled `..._preshuffle_..._nk_mn_comp_instances_partN`. The name is kept as-is
// because users outside this view may reference it -- confirm whether the difference is intended.
template <GemmSpecialization GemmSpec>
using device_gemm_xdl_universal_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances_part6 =
    std::tuple<
// clang-format off
    //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer|             AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
    //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|                |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
    //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |                |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                      Version|
    //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |                |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,   128,  16,  16,  16,   16,    8,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   192,   128,  16,  16,  16,   16,    8,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
    DeviceGemm_Xdl_CShuffleV3_BPreshuffle<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8,32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
#endif
        // clang-format on
        >;

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
