@enigma.jit
def tiled_add(mA: enigma.Tensor, mB: enigma.Tensor, mC: enigma.Tensor):
    """Tile three tensors and launch a kernel that writes ``A + B`` into ``C``.

    A thread layout and a per-thread value layout are combined into a tiler
    and a thread/value ("TV") layout.  ``mA``, ``mB`` and ``mC`` are each
    divided by that tiler, and a nested kernel is defined in which every
    thread selects its own slice of the operands, adds the two loaded inputs
    and stores the sum into its slice of the output.

    Args:
        mA: First input tensor.
        mB: Second input tensor.
        mC: Output tensor; written through ``store`` inside the kernel.
    """
    # Layout arithmetic — runs at compile time
    thr = enigma.make_ordered_layout((4, 64), order=(1, 0))
    val = enigma.make_ordered_layout((4, 4), order=(1, 0))
    tiler_mn, tv_layout = enigma.make_layout_tv(thr, val)

    # Divide each tensor by the same tiler so the three operands are
    # partitioned identically.
    gA = enigma.tensor_zipped_divide(mA, tiler_mn)
    gB = enigma.tensor_zipped_divide(mB, tiler_mn)
    gC = enigma.tensor_zipped_divide(mC, tiler_mn)

    # Inner kernel — dispatched for each tile
    @enigma.kernel
    def inner(blkA, blkB, blkC, tv):
        # NOTE(review): read as an attribute, not called — confirm this is
        # the form enigma expects.
        thread_idx = enigma.thread_position_in_grid

        # Compose each block with the TV layout, then index with
        # (thread_idx, None) to pick this thread's slice.  ``tiler_mn`` is
        # captured from the enclosing function.
        thrA = enigma.tensor_composition(blkA, tv, tiler_mn)[(thread_idx, None)]
        thrB = enigma.tensor_composition(blkB, tv, tiler_mn)[(thread_idx, None)]
        thrC = enigma.tensor_composition(blkC, tv, tiler_mn)[(thread_idx, None)]

        # Element-wise add of the two loaded operands, stored to the output.
        thrC.store(thrA.load() + thrB.load())

    # NOTE(review): ``grid`` and ``block`` are literal ``...`` placeholders,
    # and gA/gB/gC/tv_layout are not visibly passed to ``inner`` here.
    # Presumably the real launch supplies them — confirm against the enigma
    # launch API before relying on this.
    inner.launch(grid=..., block=...)