IREE Dialects

iree_codegen dialect

class iree.compiler.dialects.iree_codegen.AttentionOpDetail

property batch_dims: (self) -> list[int]

property domain_rank: (self) -> int

property k1_dims: (self) -> list[int]

property k2_dims: (self) -> list[int]

property m_dims: (self) -> list[int]

property n_dims: (self) -> list[int]

class iree.compiler.dialects.iree_codegen.CompilationInfoAttr(*args, **kwargs)

property lowering_config

property translation_info

enum iree.compiler.dialects.iree_codegen.DenormalFpMath(value)

Denormal mode for fp math

Member Type: int

Valid values are as follows:

None_ = <DenormalFpMath.None_: 0>

PreserveSign = <DenormalFpMath.PreserveSign: 1>

PositiveZero = <DenormalFpMath.PositiveZero: 2>

enum iree.compiler.dialects.iree_codegen.DispatchLoweringPassPipeline(value)

identifier for pass pipeline use to lower dispatch region

Member Type: int

Valid values are as follows:

CPUDefault = <DispatchLoweringPassPipeline.CPUDefault: 0>

CPUDoubleTilingExpert = <DispatchLoweringPassPipeline.CPUDoubleTilingExpert: 1>

CPUConvTileAndDecomposeExpert = <DispatchLoweringPassPipeline.CPUConvTileAndDecomposeExpert: 2>

Mmt4dTilingExpert = <DispatchLoweringPassPipeline.Mmt4dTilingExpert: 3>

CPUBufferOpsTileAndVectorize = <DispatchLoweringPassPipeline.CPUBufferOpsTileAndVectorize: 4>

CPUDataTiling = <DispatchLoweringPassPipeline.CPUDataTiling: 5>

CPULinalgExtTileAndVectorize = <DispatchLoweringPassPipeline.CPULinalgExtTileAndVectorize: 6>

LLVMGPUDefault = <DispatchLoweringPassPipeline.LLVMGPUDefault: 100>

LLVMGPUBaseLowering = <DispatchLoweringPassPipeline.LLVMGPUBaseLowering: 101>

LLVMGPUDistribute = <DispatchLoweringPassPipeline.LLVMGPUDistribute: 102>

LLVMGPUVectorize = <DispatchLoweringPassPipeline.LLVMGPUVectorize: 103>

LLVMGPUVectorDistribute = <DispatchLoweringPassPipeline.LLVMGPUVectorDistribute: 104>

LLVMGPUWinogradVectorize = <DispatchLoweringPassPipeline.LLVMGPUWinogradVectorize: 105>

LLVMGPUTileAndFuse = <DispatchLoweringPassPipeline.LLVMGPUTileAndFuse: 106>

SPIRVBaseLowering = <DispatchLoweringPassPipeline.SPIRVBaseLowering: 200>

SPIRVBaseDistribute = <DispatchLoweringPassPipeline.SPIRVBaseDistribute: 201>

SPIRVBaseVectorize = <DispatchLoweringPassPipeline.SPIRVBaseVectorize: 202>

SPIRVSubgroupReduce = <DispatchLoweringPassPipeline.SPIRVSubgroupReduce: 203>

SPIRVMatmulPromoteVectorize = <DispatchLoweringPassPipeline.SPIRVMatmulPromoteVectorize: 204>

SPIRVCooperativeMatrixVectorize = <DispatchLoweringPassPipeline.SPIRVCooperativeMatrixVectorize: 205>

SPIRVWinogradVectorize = <DispatchLoweringPassPipeline.SPIRVWinogradVectorize: 206>

VMVXDefault = <DispatchLoweringPassPipeline.VMVXDefault: 300>

TransformDialectCodegen = <DispatchLoweringPassPipeline.TransformDialectCodegen: 1000>

Custom = <DispatchLoweringPassPipeline.Custom: 1001>

None_ = <DispatchLoweringPassPipeline.None_: 65535>

class iree.compiler.dialects.iree_codegen.DispatchLoweringPassPipelineAttr(*args, **kwargs)

property raw_value

property value

class iree.compiler.dialects.iree_codegen.IGEMMGenericConvDetails

property conv_to_igemm_dim_map: (self) -> dict

property filter_reassoc_indices: (self) -> list[list[int]]

property igemm_contraction_maps: (self) -> iree.compiler.ir.Attribute

property igemm_loop_bounds: (self) -> list[int]

property igemm_loop_iterators: (self) -> iree.compiler.ir.Attribute

property im2col_output_perm: (self) -> list[int]

property is_output_channel_first: (self) -> bool

enum iree.compiler.dialects.iree_codegen.IntEnum(value)

Enum where members are also (and must be) ints

Member Type: int

flag iree.compiler.dialects.iree_codegen.IntFlag(value)

Support for integer-based Flags

Member Type: int

class iree.compiler.dialects.iree_codegen.ScaledContractionDimensions

property batch: (self) -> list[int]

property k: (self) -> list[int]

property kB: (self) -> list[int]

property m: (self) -> list[int]

property n: (self) -> list[int]

class iree.compiler.dialects.iree_codegen.TranslationInfoAttr(*args, **kwargs)

property codegen_spec

property configuration

property pass_pipeline

property subgroup_size

property workgroup_size

enum iree.compiler.dialects.iree_codegen.UKernelArgumentKind(value)

Attribute describing the ukernel integration point

Member Type: int

Valid values are as follows:

Tensor = <UKernelArgumentKind.Tensor: 0>

Memref = <UKernelArgumentKind.Memref: 1>

Bitcode = <UKernelArgumentKind.Bitcode: 2>

enum iree.compiler.dialects.iree_codegen.WorkgroupId(value)

Attribute that map to hal.workgrpoup.ids

Member Type: int

Valid values are as follows:

IdX = <WorkgroupId.IdX: 0>

IdY = <WorkgroupId.IdY: 1>

IdZ = <WorkgroupId.IdZ: 2>

class iree.compiler.dialects.iree_codegen.auto(value=_auto_null): Instances are replaced with an appropriate value in Enum class suites.

iree.compiler.dialects.iree_codegen.register_attribute_builder(kind, replace=False)

iree_gpu dialect

class iree.compiler.dialects.iree_gpu.BarrierRegionOp(results_, inputs, *, loc=None, ip=None)

This op is designed to represent synchronization of workers on the operands and results of the given region. This operation naturally arises when combining the regions of producer-consumer scf.forall operations that share a mapping type.

For example, consider the following pair of parallel loops. ```mlir

%0 = scf.forall (%idy, %idx) in (2, 32) shared_outs(%init = %empty) -> (tensor<4x128xf32>) {
%in = … %2 = affine.apply #affine_map<(d0) -> (d0 * 2)> (%idy) %3 = affine.apply #affine_map<(d0) -> (d0 * 4)> (%idx) scf.forall.in_parallel {

tensor.parallel_insert_slice %in into %init[%2, %3] [2, 4] [1, 1]
: tensor<2x4xf32> into tensor<4x128xf32>

}

} {mapping = [#gpu.thread<y>, #gpu.thread<x>]} %1 = scf.forall (%idy, %idx) in (8, 8) -> (tensor<128x128xf32>) {

%4 = affine.apply #affine_map<(d0) -> (d0 * 16)> (%idx) %extracted_slice = tensor.extract_slice %0[0, %4] [4, 16] [1, 1]

: tensor<4x128xf32> to tensor<4x16xf32>

…

} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

```

Because these loops share the same worker type and total count, the bodies of these two loops can be merged with a barrier an insert_slice and a shuffle where the boundary of the loops currently is.

```mlir

%0 = scf.forall (%idy, %idx) in (8, 8) -> (tensor<4x128xf32>) {

%alloc = bufferization.alloc_tensor {memory_space = #gpu.address_space<workgroup>}: : tensor<4x128xf32>

%barrier = iree_gpu.barrier_region %alloc { ^bb0(%shared: tensor<4x128xf32>):

%ids = affine.delinearize_index %idy * 8 + %idx to (2, 32) : index %in = … %2 = affine.apply #affine_map<(d0) -> (d0 * 2)> (%ids#0) %3 = affine.apply #affine_map<(d0) -> (d0 * 4)> (%ids#1) %inserted_slice = tensor.insert_slice %in into %shared[%2, %3] [2, 4] [1, 1]

: tensor<2x4xf32> to tensor<4x128xf32>

iree_gpu.yield %slice : tensor<4x16xf32>

} : tensor<4x128xf32> -> tensor<4x16xf32> %4 = affine.apply #affine_map<(d0) -> (d0 * 16)> (%idx) %slice = tensor.extract_slice %barrier[0, %4] [4, 16] [1, 1] : tensor<4x128xf32> to tensor<4x16xf32> …

} {mapping = [#gpu.thread<y>, #gpu.thread<x>]}

```

A barrier_region can be lowered to two barriers, one on the input operands and a second one on the results.

Movtivation and Intended Use Cases:

The primary way this op is generated is when fusing parallel loops with tensor results. This operation helps to make lowerings more progressive and flexible.

Lowering directly to an alloc + reads and writes breaks the dependency chain making transformations like barrier placement and pipelining potentially more difficult.

Allows the option of non-vector based lowering paths.

OPERATION_NAME = 'iree_gpu.barrier_region'

property inputs: iree.compiler._mlir_libs._mlir.ir.OpOperandList

property region: iree.compiler._mlir_libs._mlir.ir.Region

property results_: iree.compiler._mlir_libs._mlir.ir.OpResultList

class iree.compiler.dialects.iree_gpu.BufferResourceCastOp(result, input, *, cache_swizzle_stride=None, loc=None, ip=None)

Nominal cast of a tensor to AMDGPU buffer resource memory space before bufferization. This op takes the parameters with which to perform the cast if |input| bufferizes to storage_buffer memory space. If |input| resolves to any other memory space this op is silently dropped and has no effect.

If |cache_swizzle_stride| is present, there is verification before bufferization that all producers of |input| are view-like and single source and user (i.e. trivially no alias). In all other cases this op is best effort and has no verification or failure modes.

// TODO: Add other parameters for casting as needed.

OPERATION_NAME = 'iree_gpu.buffer_resource_cast'

property cache_swizzle_stride: Optional[iree.compiler._mlir_libs._mlir.ir.Value[iree.compiler._mlir_libs._mlir.ir.IndexType]]

property input: iree.compiler._mlir_libs._mlir.ir.Value[iree.compiler._mlir_libs._mlir.ir.RankedTensorType]

property result: iree.compiler._mlir_libs._mlir.ir.OpResult[iree.compiler._mlir_libs._mlir.ir.RankedTensorType]: Shortcut to get an op result if it has only one (throws an error otherwise).

class iree.compiler.dialects.iree_gpu.CoalescedGatherDMAOp(result, source, indices, init, lane, *, loc=None, ip=None)

Performs a coalesced gather operation. This operation can exist in two forms: a tensor-based (value-semantic) form and a buffer-based (memref-semantic) form.

In both forms, it reads elements from a source operand based on the optional indices operand and writes the gathered data into the destination init operand using destination-passing style.

The indices operand is optional. The source represents the data loaded by this thread, while init is the collective output for all threads in the subgroup. Therefore, source and init may have different shapes (typically source is smaller, representing one thread’s portion).

The operation is specifically designed for subgroup-level parallelism, where threads within a subgroup cooperatively gather data with coalesced memory accesses. It implements ParallelCombiningOpInterface and must live inside an op implementing InParallelOpInterface, such as scf.forall.in_parallel.

## Lowering Paths

Two lowering strategies are supported: 1. Lowers to amdgpu.gather_to_lds operations when lowering

requirements are met.

Default lowering using vector.gather operations.

## Operands and Results

$indices: The variadic indices operand is an optional tensor or vector of indices to gather from source. If the indices are present, their shape must be a prefix of the init/result type. Each element of indices must be a 1D tensor or vector whose length matches the length of the corresponding dimension of source.

The values in indices form indices into the memref starting at source from which a given thread will gather data, and each tensor/vector component in indices corresponds to one index dimension in source. Any component that is not specified is implicitly assumed to be [0, 1, …, len - 1] where len is the length of the corresponding dimension of source. That is, gather all the elements along that dimension.

This operation will gather data into its result as by setting: ``` forall (i0, i1, … iN) in (dim(source, 0), dim(source, 1), …

dim(source, N)):

result[i0, i1, … iN-1, iN + lane_id * dim(init, N)] =
source[indices[0][i0], indices[1][i1], …, indices[N][iN]]

``` where lane_id is the ID of the thread within its subgroup.

Note that, in order to enable efficient gathers, the trailing dimension of source must have unspecified indices and the dim’s size must be a supported DMA width for your target.

$indices supports both index and i32 element types. The reason is that one lowering path (from linalg_ext.gather) already have indices in i32 type.
$source: Source tensor/memref containing the data to be gathered.
$init: Destination tensor/memref receiving the gathered data (destination-passing style).
lane: The lane that specifies the coalescing store’s offset within the workgroup/shared memory.

## Example of a single subgroup using coalesced_gather_dma in copy mode: for transferring tensor<4x128xf32>, with an intended DMA width of 128 bits (4 x f32), with subgroup size 32:

```mlir scf.forall (%arg6) in (32) … {

%2 = %arg6 * 4 %thread_slice = … : tensor<4x128xf32> %dest_slice = … : tensor<4x128xf32> scf.forall.in_parallel {

iree_gpu.coalesced_gather_dma %thread_slice into %dest_slice lane(%arg6) : …

}

} {mapping = [#gpu.lane_id<linear_dim_0>]}

```

OPERATION_NAME = 'iree_gpu.coalesced_gather_dma'

property indices: iree.compiler._mlir_libs._mlir.ir.OpOperandList

property init: iree.compiler._mlir_libs._mlir.ir.Value

property lane: iree.compiler._mlir_libs._mlir.ir.Value[iree.compiler._mlir_libs._mlir.ir.IndexType]

property result: Optional[iree.compiler._mlir_libs._mlir.ir.OpResult]: Shortcut to get an op result if it has only one (throws an error otherwise).

property source: iree.compiler._mlir_libs._mlir.ir.Value

flag iree.compiler.dialects.iree_gpu.ComputeBitwidths(value)

Supported bitwidths for compute

Member Type: int

Valid values are as follows:

FP64 = <ComputeBitwidths.FP64: 1>

FP32 = <ComputeBitwidths.FP32: 2>

FP16 = <ComputeBitwidths.FP16: 4>

Int64 = <ComputeBitwidths.Int64: 8>

Int32 = <ComputeBitwidths.Int32: 16>

Int16 = <ComputeBitwidths.Int16: 32>

Int8 = <ComputeBitwidths.Int8: 64>

FP8 = <ComputeBitwidths.FP8: 128>

FP6 = <ComputeBitwidths.FP6: 256>

FP4 = <ComputeBitwidths.FP4: 512>

flag iree.compiler.dialects.iree_gpu.DotProductOps(value)

Supported dot product ops

Member Type: int

Valid values are as follows:

DP4xI8ToI32 = <DotProductOps.DP4xI8ToI32: 1>

class iree.compiler.dialects.iree_gpu.GPUMMASingleSubgroupLayout

property element: (self) -> list[int]

property outer: (self) -> list[int]

property thread: (self) -> list[int]

property tstrides: (self) -> list[int]

enum iree.compiler.dialects.iree_gpu.IntEnum(value)

Enum where members are also (and must be) ints

Member Type: int

flag iree.compiler.dialects.iree_gpu.IntFlag(value)

Support for integer-based Flags

Member Type: int

enum iree.compiler.dialects.iree_gpu.IteratorType(value)

Iterator type

Member Type: int

Valid values are as follows:

parallel = <IteratorType.parallel: 0>

reduction = <IteratorType.reduction: 1>

class iree.compiler.dialects.iree_gpu.LoweringConfigAttr(*args, **kwargs)

property attributes

property mma_kind

property reduction_tile_sizes

property subgroup_basis

property workgroup_tile_sizes

class iree.compiler.dialects.iree_gpu.MMAAttr(*args, **kwargs)

property abc_element_types

property abc_vector_types

get_virtual_intrinsics: Returns a list of virtual intrinsics associated with this MMAAttr.

property mnk_shape

enum iree.compiler.dialects.iree_gpu.MMAIntrinsic(value)

Descriptor for different MMA intrinsics

Member Type: int

Valid values are as follows:

MFMA_F32_16x16x4_F32 = <MMAIntrinsic.MFMA_F32_16x16x4_F32: 4112>

MFMA_F32_16x16x16_F16 = <MMAIntrinsic.MFMA_F32_16x16x16_F16: 4128>

MFMA_F32_32x32x8_F16 = <MMAIntrinsic.MFMA_F32_32x32x8_F16: 4129>

MFMA_I32_16x16x16_I8 = <MMAIntrinsic.MFMA_I32_16x16x16_I8: 4288>

MFMA_I32_32x32x8_I8 = <MMAIntrinsic.MFMA_I32_32x32x8_I8: 4289>

MFMA_F32_16x16x8_BF16 = <MMAIntrinsic.MFMA_F32_16x16x8_BF16: 4384>

MFMA_F32_32x32x4_BF16 = <MMAIntrinsic.MFMA_F32_32x32x4_BF16: 4385>

MFMA_F64_16x16x4_F64 = <MMAIntrinsic.MFMA_F64_16x16x4_F64: 4352>

MFMA_F32_16x16x16_BF16 = <MMAIntrinsic.MFMA_F32_16x16x16_BF16: 4640>

MFMA_F32_32x32x8_BF16 = <MMAIntrinsic.MFMA_F32_32x32x8_BF16: 4641>

MFMA_F32_16x16x32_F8E5M2FNUZ = <MMAIntrinsic.MFMA_F32_16x16x32_F8E5M2FNUZ: 4656>

MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ = <MMAIntrinsic.MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ: 4657>

MFMA_F32_16x16x32_F8E4M3FNUZ = <MMAIntrinsic.MFMA_F32_16x16x32_F8E4M3FNUZ: 4658>

MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ = <MMAIntrinsic.MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ: 4659>

MFMA_F32_32x32x16_F8E5M2FNUZ = <MMAIntrinsic.MFMA_F32_32x32x16_F8E5M2FNUZ: 4660>

MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ = <MMAIntrinsic.MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ: 4661>

MFMA_F32_32x32x16_F8E4M3FNUZ = <MMAIntrinsic.MFMA_F32_32x32x16_F8E4M3FNUZ: 4662>

MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ = <MMAIntrinsic.MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ: 4663>

MFMA_I32_16x16x32_I8 = <MMAIntrinsic.MFMA_I32_16x16x32_I8: 4800>

MFMA_I32_32x32x16_I8 = <MMAIntrinsic.MFMA_I32_32x32x16_I8: 4801>

MFMA_F32_16x16x32_F16 = <MMAIntrinsic.MFMA_F32_16x16x32_F16: 4896>

MFMA_F32_32x32x16_F16 = <MMAIntrinsic.MFMA_F32_32x32x16_F16: 4897>

MFMA_F32_16x16x32_BF16 = <MMAIntrinsic.MFMA_F32_16x16x32_BF16: 4898>

MFMA_F32_32x32x16_BF16 = <MMAIntrinsic.MFMA_F32_32x32x16_BF16: 4899>

MFMA_F32_16x16x32_F8E5M2 = <MMAIntrinsic.MFMA_F32_16x16x32_F8E5M2: 4912>

MFMA_F32_16x16x32_F8E5M2_F8E4M3FN = <MMAIntrinsic.MFMA_F32_16x16x32_F8E5M2_F8E4M3FN: 4913>

MFMA_F32_16x16x32_F8E4M3FN = <MMAIntrinsic.MFMA_F32_16x16x32_F8E4M3FN: 4914>

MFMA_F32_16x16x32_F8E4M3FN_F8E5M2 = <MMAIntrinsic.MFMA_F32_16x16x32_F8E4M3FN_F8E5M2: 4915>

MFMA_F32_32x32x16_F8E5M2 = <MMAIntrinsic.MFMA_F32_32x32x16_F8E5M2: 4916>

MFMA_F32_32x32x16_F8E5M2_F8E4M3FN = <MMAIntrinsic.MFMA_F32_32x32x16_F8E5M2_F8E4M3FN: 4917>

MFMA_F32_32x32x16_F8E4M3FN = <MMAIntrinsic.MFMA_F32_32x32x16_F8E4M3FN: 4918>

MFMA_F32_32x32x16_F8E4M3FN_F8E5M2 = <MMAIntrinsic.MFMA_F32_32x32x16_F8E4M3FN_F8E5M2: 4919>

MFMA_F32_16x16x128_F8E5M2 = <MMAIntrinsic.MFMA_F32_16x16x128_F8E5M2: 4920>

MFMA_F32_16x16x128_F8E5M2_F8E4M3FN = <MMAIntrinsic.MFMA_F32_16x16x128_F8E5M2_F8E4M3FN: 4921>

MFMA_F32_16x16x128_F8E4M3FN = <MMAIntrinsic.MFMA_F32_16x16x128_F8E4M3FN: 4922>

MFMA_F32_16x16x128_F8E4M3FN_F8E5M2 = <MMAIntrinsic.MFMA_F32_16x16x128_F8E4M3FN_F8E5M2: 4923>

MFMA_F32_32x32x64_F8E5M2 = <MMAIntrinsic.MFMA_F32_32x32x64_F8E5M2: 4924>

MFMA_F32_32x32x64_F8E5M2_F8E4M3FN = <MMAIntrinsic.MFMA_F32_32x32x64_F8E5M2_F8E4M3FN: 4925>

MFMA_F32_32x32x64_F8E4M3FN = <MMAIntrinsic.MFMA_F32_32x32x64_F8E4M3FN: 4926>

MFMA_F32_32x32x64_F8E4M3FN_F8E5M2 = <MMAIntrinsic.MFMA_F32_32x32x64_F8E4M3FN_F8E5M2: 4927>

MFMA_I32_16x16x64_I8 = <MMAIntrinsic.MFMA_I32_16x16x64_I8: 5056>

MFMA_I32_32x32x32_I8 = <MMAIntrinsic.MFMA_I32_32x32x32_I8: 5057>

WMMAR3_F32_16x16x16_F16 = <MMAIntrinsic.WMMAR3_F32_16x16x16_F16: 6176>

WMMAR3_F16_16x16x16_F16 = <MMAIntrinsic.WMMAR3_F16_16x16x16_F16: 6177>

WMMAR3_F32_16x16x16_BF16 = <MMAIntrinsic.WMMAR3_F32_16x16x16_BF16: 6178>

WMMAR3_BF16_16x16x16_BF16 = <MMAIntrinsic.WMMAR3_BF16_16x16x16_BF16: 6179>

WMMAR3_I32_16x16x16_I8 = <MMAIntrinsic.WMMAR3_I32_16x16x16_I8: 6336>

WMMAR4_F32_16x16x16_F16 = <MMAIntrinsic.WMMAR4_F32_16x16x16_F16: 6432>

WMMAR4_F16_16x16x16_F16 = <MMAIntrinsic.WMMAR4_F16_16x16x16_F16: 6433>

WMMAR4_F32_16x16x16_BF16 = <MMAIntrinsic.WMMAR4_F32_16x16x16_BF16: 6434>

WMMAR4_BF16_16x16x16_BF16 = <MMAIntrinsic.WMMAR4_BF16_16x16x16_BF16: 6435>

WMMAR4_F32_16x16x16_F8E5M2 = <MMAIntrinsic.WMMAR4_F32_16x16x16_F8E5M2: 6448>

WMMAR4_F32_16x16x16_F8E5M2_F8E4M3FN = <MMAIntrinsic.WMMAR4_F32_16x16x16_F8E5M2_F8E4M3FN: 6449>

WMMAR4_F32_16x16x16_F8E4M3FN = <MMAIntrinsic.WMMAR4_F32_16x16x16_F8E4M3FN: 6450>

WMMAR4_F32_16x16x16_F8E4M3FN_F8E5M2 = <MMAIntrinsic.WMMAR4_F32_16x16x16_F8E4M3FN_F8E5M2: 6451>

WMMAR4_I32_16x16x16_I8 = <MMAIntrinsic.WMMAR4_I32_16x16x16_I8: 6592>

WMMA_F32_16x16x4_F32 = <MMAIntrinsic.WMMA_F32_16x16x4_F32: 6672>

WMMA_F32_16x16x32_F16 = <MMAIntrinsic.WMMA_F32_16x16x32_F16: 6688>

WMMA_F32_16x16x32_BF16 = <MMAIntrinsic.WMMA_F32_16x16x32_BF16: 6689>

WMMA_F16_16x16x32_F16 = <MMAIntrinsic.WMMA_F16_16x16x32_F16: 6690>

WMMA_BF16_16x16x32_BF16 = <MMAIntrinsic.WMMA_BF16_16x16x32_BF16: 6691>

WMMA_F32_16x16x64_F8E4M3FN = <MMAIntrinsic.WMMA_F32_16x16x64_F8E4M3FN: 6704>

WMMA_F32_16x16x64_F8E4M3FN_F8E5M2 = <MMAIntrinsic.WMMA_F32_16x16x64_F8E4M3FN_F8E5M2: 6705>

WMMA_F32_16x16x64_F8E5M2 = <MMAIntrinsic.WMMA_F32_16x16x64_F8E5M2: 6706>

WMMA_F32_16x16x64_F8E5M2_F8E4M3FN = <MMAIntrinsic.WMMA_F32_16x16x64_F8E5M2_F8E4M3FN: 6707>

WMMA_F16_16x16x64_F8E4M3FN = <MMAIntrinsic.WMMA_F16_16x16x64_F8E4M3FN: 6708>

WMMA_F16_16x16x64_F8E4M3FN_F8E5M2 = <MMAIntrinsic.WMMA_F16_16x16x64_F8E4M3FN_F8E5M2: 6709>

WMMA_F16_16x16x64_F8E5M2 = <MMAIntrinsic.WMMA_F16_16x16x64_F8E5M2: 6710>

WMMA_F16_16x16x64_F8E5M2_F8E4M3FN = <MMAIntrinsic.WMMA_F16_16x16x64_F8E5M2_F8E4M3FN: 6711>

WMMA_I32_16x16x64_I8 = <MMAIntrinsic.WMMA_I32_16x16x64_I8: 6848>

WMMA_F32_16x16x128_F8E5M2 = <MMAIntrinsic.WMMA_F32_16x16x128_F8E5M2: 6712>

WMMA_F32_16x16x128_F8E5M2_F8E4M3FN = <MMAIntrinsic.WMMA_F32_16x16x128_F8E5M2_F8E4M3FN: 6713>

WMMA_F32_16x16x128_F8E4M3FN = <MMAIntrinsic.WMMA_F32_16x16x128_F8E4M3FN: 6714>

WMMA_F32_16x16x128_F8E4M3FN_F8E5M2 = <MMAIntrinsic.WMMA_F32_16x16x128_F8E4M3FN_F8E5M2: 6715>

WMMA_F16_16x16x128_F8E5M2 = <MMAIntrinsic.WMMA_F16_16x16x128_F8E5M2: 6716>

WMMA_F16_16x16x128_F8E5M2_F8E4M3FN = <MMAIntrinsic.WMMA_F16_16x16x128_F8E5M2_F8E4M3FN: 6717>

WMMA_F16_16x16x128_F8E4M3FN = <MMAIntrinsic.WMMA_F16_16x16x128_F8E4M3FN: 6718>

WMMA_F16_16x16x128_F8E4M3FN_F8E5M2 = <MMAIntrinsic.WMMA_F16_16x16x128_F8E4M3FN_F8E5M2: 6719>

NV_WMMA_F32_16x16x16_F16 = <MMAIntrinsic.NV_WMMA_F32_16x16x16_F16: 8224>

NV_WMMA_F16_16x16x16_F16 = <MMAIntrinsic.NV_WMMA_F16_16x16x16_F16: 8225>

class iree.compiler.dialects.iree_gpu.MMAIntrinsicAttr(*args, **kwargs)

property mma

property raw_value

property value

class iree.compiler.dialects.iree_gpu.PipelineOptionsAttr(*args, **kwargs)

property no_reduce_shared_memory_bank_conflicts

property prefetch_num_stages

property reorder_workgroups_strategy

property use_igemm_convolution

enum iree.compiler.dialects.iree_gpu.ReorderWorkgroupsStrategy(value)

Strategy for workgroup reordering

Member Type: int

Valid values are as follows:

None_ = <ReorderWorkgroupsStrategy.None_: 0>

Transpose = <ReorderWorkgroupsStrategy.Transpose: 1>

class iree.compiler.dialects.iree_gpu.ReorderWorkgroupsStrategyAttr(*args, **kwargs)

property raw_value

property value

enum iree.compiler.dialects.iree_gpu.ScaledMMAIntrinsic(value)

Descriptor for different scaled MMA intrinsics

Member Type: int

Valid values are as follows:

MFMA_SCALE_F32_16x16x128_B32 = <ScaledMMAIntrinsic.MFMA_SCALE_F32_16x16x128_B32: 4096>

MFMA_SCALE_F32_32x32x64_B32 = <ScaledMMAIntrinsic.MFMA_SCALE_F32_32x32x64_B32: 4097>

flag iree.compiler.dialects.iree_gpu.StorageBitwidths(value)

Supported bitwidths for storage

Member Type: int

Valid values are as follows:

B64 = <StorageBitwidths.B64: 1>

B32 = <StorageBitwidths.B32: 2>

B16 = <StorageBitwidths.B16: 4>

B8 = <StorageBitwidths.B8: 8>

flag iree.compiler.dialects.iree_gpu.SubgroupOps(value)

Supported subgroup ops

Member Type: int

Valid values are as follows:

Shuffle = <SubgroupOps.Shuffle: 1>

Arithmetic = <SubgroupOps.Arithmetic: 2>

class iree.compiler.dialects.iree_gpu.TargetInfo(*args, **kwargs)

property arch: (self) -> str

get_gpu_target_info = <nanobind.nb_func object>

property max_thread_count_per_workgroup: (self) -> int

property max_workgroup_memory_bytes: (self) -> int

property max_workgroup_sizes: (self) -> list[int]

property mma_intrinsics: (self) -> list

property simds_per_workgroup: (self) -> int

property subgroup_size_choices: (self) -> list[int]

property workgroup_count: (self) -> int

enum iree.compiler.dialects.iree_gpu.TilingLevel(value)

Descriptor for tiling levels for GPU lowering configs

Member Type: int

Valid values are as follows:

Workgroup = <TilingLevel.Workgroup: 0>

Reduction = <TilingLevel.Reduction: 1>

PartialReduction = <TilingLevel.PartialReduction: 2>

Serial = <TilingLevel.Serial: 3>

Thread = <TilingLevel.Thread: 4>

Subgroup = <TilingLevel.Subgroup: 5>

Lane = <TilingLevel.Lane: 6>

class iree.compiler.dialects.iree_gpu.ValueBarrierOp(results_, inputs, *, loc=None, ip=None)

This operation acts as a barrier on a value semantic SSA values (tensor or vector). It takes multiple operands and produces a value equivalent to each input. This does not have copy and/or data movement semantics and simply represents a barrier on all writes in the tensor case, and a barrier until all threads acquire the input vector in the vector case.

The inputs must be either all tensors, or all vectors.

This operation is a no-op when not present in a parallel context. This operation is pure as it only requires synchronization for the value it produces.

OPERATION_NAME = 'iree_gpu.value_barrier'

property inputs: iree.compiler._mlir_libs._mlir.ir.OpOperandList

property results_: iree.compiler._mlir_libs._mlir.ir.OpResultList

class iree.compiler.dialects.iree_gpu.VirtualMMAAttr(*args, **kwargs)

property abc_element_types

property abc_vector_types

property mnk_shape

enum iree.compiler.dialects.iree_gpu.VirtualMMAIntrinsic(value)

Descriptor for different Virtual MMA intrinsics

Member Type: int

Valid values are as follows:

VMFMA_F32_16x16x32_F16 = <VirtualMMAIntrinsic.VMFMA_F32_16x16x32_F16: 0>

VMFMA_F32_32x32x16_F16 = <VirtualMMAIntrinsic.VMFMA_F32_32x32x16_F16: 1>

VMFMA_F32_16x16x32_F8E4M3FNUZ = <VirtualMMAIntrinsic.VMFMA_F32_16x16x32_F8E4M3FNUZ: 2>

VMFMA_F32_32x32x16_F8E4M3FNUZ = <VirtualMMAIntrinsic.VMFMA_F32_32x32x16_F8E4M3FNUZ: 3>

class iree.compiler.dialects.iree_gpu.VirtualMMAIntrinsicAttr(*args, **kwargs)

property mma

property raw_value

property value

class iree.compiler.dialects.iree_gpu.YieldOp(values, *, loc=None, ip=None)

This operation is used to yield values from a within a region.

OPERATION_NAME = 'iree_gpu.yield'

property values: iree.compiler._mlir_libs._mlir.ir.OpOperandList

class iree.compiler.dialects.iree_gpu.auto(value=_auto_null): Instances are replaced with an appropriate value in Enum class suites.

iree.compiler.dialects.iree_gpu.barrier_region(results_, inputs, *, loc=None, ip=None) → Union[iree.compiler._mlir_libs._mlir.ir.OpResult, iree.compiler._mlir_libs._mlir.ir.OpResultList, iree.compiler.dialects._iree_gpu_ops_gen.BarrierRegionOp]

iree.compiler.dialects.iree_gpu.buffer_resource_cast(result, input, *, cache_swizzle_stride=None, loc=None, ip=None) → iree.compiler._mlir_libs._mlir.ir.OpResult

iree.compiler.dialects.iree_gpu.coalesced_gather_dma(result, source, indices, init, lane, *, loc=None, ip=None) → Union[iree.compiler._mlir_libs._mlir.ir.OpResult, iree.compiler._mlir_libs._mlir.ir.OpResultList, iree.compiler.dialects._iree_gpu_ops_gen.CoalescedGatherDMAOp]

iree.compiler.dialects.iree_gpu.register_attribute_builder(kind, replace=False)

iree.compiler.dialects.iree_gpu.value_barrier(results_, inputs, *, loc=None, ip=None) → Union[iree.compiler._mlir_libs._mlir.ir.OpResult, iree.compiler._mlir_libs._mlir.ir.OpResultList, iree.compiler.dialects._iree_gpu_ops_gen.ValueBarrierOp]

iree.compiler.dialects.iree_gpu.yield_(values, *, loc=None, ip=None) → iree.compiler.dialects._iree_gpu_ops_gen.YieldOp