Introduction to TOPI¶

Author: Ehsan M. Kermani

This is an introductory tutorial to TVM Operator Inventory (TOPI). TOPI provides numpy-style generic operations and schedules with higher abstractions than TVM. In this tutorial, we will see how TOPI can save us from writing boilerplate code in TVM.

from __future__ import absolute_import, print_function

import tvm
import topi
import numpy as np


Basic example¶

Let’s revisit the sum of rows operation (equivalent to B = numpy.sum(A, axis=1)). To compute the sum of rows of a two dimensional TVM tensor A, we should specify the symbolic operation as well as the schedule as follows.

# Declare the row-sum by hand: symbolic shape variables, a placeholder
# input, an explicit reduce axis over the columns, and the compute rule.
n = tvm.var("n")
m = tvm.var("m")
A = tvm.placeholder((n, m), name='A')
k = tvm.reduce_axis((0, m), "k")  # reduction runs over axis 1 (columns)
B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
s = tvm.create_schedule(B.op)


and to examine the IR code in human readable format, we can do

print(tvm.lower(s, [A], simple_mode=True))


Out:

// attr [B] storage_scope = "global"
allocate B[float32 * n]
produce B {
for (i, 0, n) {
B[i] = 0.000000f
for (k, 0, m) {
B[i] = (B[i] + A[((i*m) + k)])
}
}
}


However, for such a common operation we had to define the reduce axis ourselves as well as explicit computation with tvm.compute. Imagine for more complicated operations how much details we need to provide. Fortunately, we can replace those two lines with simple topi.sum much like numpy.sum

# topi.sum declares the reduce axis and the compute rule for us,
# mirroring numpy.sum(A, axis=1).
C = topi.sum(A, axis=1)
ts = tvm.create_schedule(C.op)
print(tvm.lower(ts, [A], simple_mode=True))


Out:

// attr [A_red] storage_scope = "global"
allocate A_red[float32 * n]
produce A_red {
for (ax0, 0, n) {
A_red[ax0] = 0.000000f
for (k1, 0, m) {
A_red[ax0] = (A_red[ax0] + A[((ax0*m) + k1)])
}
}
}


We can add two tensors using topi.broadcast_add that have correct (broadcastable with specific) shapes. Even shorter, TOPI provides operator overloading for such common operations. For example,

# Broadcasting example: a is rank-3, b is rank-2 with broadcastable shapes.
# TOPI overloads the arithmetic operators, so `+` and `*` dispatch to the
# broadcast ops directly.
x, y = 100, 10
a = tvm.placeholder((x, y, y), name="a")
b = tvm.placeholder((y, y), name="b")
c = a + b  # same as topi.broadcast_add; `c` is reused in the fusion example below
d = a * b  # same as topi.broadcast_mul


Overloaded with the same syntax, TOPI also handles broadcasting a primitive (int, float) against a tensor, e.g. d - 3.14.

Generic schedules and fusing operations¶

Up to now, we have seen an example of how TOPI can save us from writing explicit computations in lower level API. But it doesn’t stop here. Still we did the scheduling as before. TOPI also provides higher level scheduling recipes depending on a given context. For example, for CUDA, we can schedule the following series of operations ending with topi.sum using only topi.generic.schedule_reduce

# A small chain of ops ending in a reduction; one generic recipe,
# topi.generic.schedule_reduce, schedules the whole chain for CUDA.
e = topi.elemwise_sum([c, d])
f = e / 2.0
g = topi.sum(f)
with tvm.target.cuda():
    # The target scope tells TOPI which backend's schedule to pick.
    sg = topi.generic.schedule_reduce(g)
    print(tvm.lower(sg, [a, b], simple_mode=True))


Out:

// attr [T_divide_red] storage_scope = "global"
allocate T_divide_red[float32 * 1]
produce T_divide_red {
// attr [T_divide_red.rf] storage_scope = "local"
allocate T_divide_red.rf[float32 * 1]
// attr [reduce_temp0] storage_scope = "local"
allocate reduce_temp0[float32 * 1]
produce T_divide_red.rf {
T_divide_red.rf[0] = 0.000000f
for (k0.k1.fused.k2.fused.outer, 0, 20) {
if (((k0.k1.fused.k2.fused.outer*512) < (10000 - threadIdx.x))) {
}
}
}
// attr [comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f])] reduce_scope = reinterpret((uint64)0)
T_divide_red[0] = reduce_temp0[0]
}
}


As you can see, scheduled stages of computation have been accumulated and we can examine them by

print(sg.stages)


Out:

[stage(a, 0x7f6f3bf96a70), stage(b, 0x7f6eb52f87f0), stage(T_add, 0x7f6ec7a31a80), stage(T_multiply, 0x16faf320), stage(T_elemwise_sum, 0x7f6f402f9340), stage(T_divide, 0x7f6eccd07d20), stage(T_divide_red.rf, 0x7f6e425d1b90), stage(T_divide_red, 0x7f6eb52f11d0)]


We can test the correctness by comparing with numpy result as follows

# Build the scheduled function for CUDA and verify it against numpy.
func = tvm.build(sg, [a, b, g], 'cuda')
ctx = tvm.gpu(0)
a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype)
b_np = np.random.uniform(size=(y, y)).astype(b.dtype)
# Reference result: sum(((a + b) + (a * b)) / 2), matching e / f / g above.
g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0)
a_nd = tvm.nd.array(a_np, ctx)
b_nd = tvm.nd.array(b_np, ctx)
# Output buffer on the device, zero-initialized with the reference's shape/dtype.
g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
func(a_nd, b_nd, g_nd)
tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-5)


TOPI also provides common neural network operations such as softmax with an optimized schedule.

# Softmax with TOPI's optimized CUDA schedule.
tarray = tvm.placeholder((512, 512), name="tarray")
softmax_topi = topi.nn.softmax(tarray)
with tvm.target.create("cuda"):
    # Schedule must be created inside the target scope so the CUDA
    # implementation of the generic function is selected.
    sst = topi.generic.schedule_softmax(softmax_topi)
    print(tvm.lower(sst, [tarray], simple_mode=True))


Out:

// attr [T_softmax_maxelem] storage_scope = "global"
allocate T_softmax_maxelem[float32 * 512]
// attr [T_softmax_expsum] storage_scope = "global"
allocate T_softmax_expsum[float32 * 512]
// attr [T_softmax_norm] storage_scope = "global"
allocate T_softmax_norm[float32 * 262144]
produce T_softmax_maxelem {
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 512
T_softmax_maxelem[blockIdx.x] = -340282346638528859811704183484516925440.000000f
for (k, 0, 512) {
T_softmax_maxelem[blockIdx.x] = max(T_softmax_maxelem[blockIdx.x], tarray[((blockIdx.x*512) + k)])
}
}
produce T_softmax_expsum {
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 512
// attr [T_softmax_expsum.rf] storage_scope = "local"
allocate T_softmax_expsum.rf[float32 * 1]
// attr [reduce_temp0] storage_scope = "local"
allocate reduce_temp0[float32 * 1]
produce T_softmax_expsum.rf {
T_softmax_expsum.rf[0] = 0.000000f
for (k.outer, 0, 8) {
T_softmax_expsum.rf[0] = (T_softmax_expsum.rf[0] + exp((tarray[((((blockIdx.x*8) + k.outer)*64) + threadIdx.x)] - T_softmax_maxelem[blockIdx.x])))
}
}
// attr [comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f])] reduce_scope = reinterpret((uint64)0)
T_softmax_expsum[blockIdx.x] = reduce_temp0[0]
}
}
produce T_softmax_norm {
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 512
for (i1.inner, 0, 8) {
T_softmax_norm[((((blockIdx.x*64) + threadIdx.x)*8) + i1.inner)] = (exp((tarray[((((blockIdx.x*64) + threadIdx.x)*8) + i1.inner)] - T_softmax_maxelem[blockIdx.x]))/T_softmax_expsum[blockIdx.x])
}
}


Fusing convolutions¶

We can fuse topi.nn.conv2d and topi.nn.relu together.

Note

TOPI functions are all generic functions. They have different implementations for different backends to optimize for performance. For each backend, it is necessary to call them under a target scope for both compute declaration and schedule. TVM will choose the right function to call with the target information.

# Fuse conv2d + relu. Both the compute declaration and the schedule must be
# created under the target scope (see the Note above): TOPI's generic
# functions dispatch on the current target.
data = tvm.placeholder((1, 3, 224, 224))
kernel = tvm.placeholder((10, 3, 5, 5))

with tvm.target.create("cuda"):
    conv = topi.nn.conv2d(data, kernel, strides=1, padding=2, dilation=1)
    out = topi.nn.relu(conv)
    sconv = topi.generic.nn.schedule_conv2d_nchw(out)
    print(tvm.lower(sconv, [data, kernel], simple_mode=True))


Out:

// attr [compute] storage_scope = "global"
allocate compute[float32 * 501760]
produce compute {
// attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = 5
// attr [compute] storage_scope = "local"
allocate compute[float32 * 16]
// attr [pad_temp.shared] storage_scope = "shared"
// attr [placeholder.shared] storage_scope = "shared"
allocate placeholder.shared[float32 * 2]
// attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 28
// attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 14
produce compute {
compute[0] = 0.000000f
compute[1] = 0.000000f
compute[2] = 0.000000f
compute[3] = 0.000000f
compute[4] = 0.000000f
compute[5] = 0.000000f
compute[6] = 0.000000f
compute[7] = 0.000000f
compute[8] = 0.000000f
compute[9] = 0.000000f
compute[10] = 0.000000f
compute[11] = 0.000000f
compute[12] = 0.000000f
compute[13] = 0.000000f
compute[14] = 0.000000f
compute[15] = 0.000000f
for (rc.outer, 0, 3) {
for (ry.outer, 0, 5) {
pad_temp.shared[(((((((threadIdx.x*8) + 1)/128)*16) + threadIdx.x)*8) + 1)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 1)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 1)/16)))) && ((2 - (((threadIdx.x*8) + 1) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 1) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 1)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 1)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 1) % 16)) + -450)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 2)/128)*16) + threadIdx.x)*8) + 2)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 2)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 2)/16)))) && ((2 - (((threadIdx.x*8) + 2) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 2) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 2)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 2)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 2) % 16)) + -450)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 3)/128)*16) + threadIdx.x)*8) + 3)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 3)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 3)/16)))) && ((2 - (((threadIdx.x*8) + 3) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 3) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 3)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 3)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 3) % 16)) + -450)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 4)/128)*16) + threadIdx.x)*8) + 4)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 4)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 4)/16)))) && ((2 - (((threadIdx.x*8) + 4) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 4) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 4)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 4)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 4) % 16)) + -450)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 5)/128)*16) + threadIdx.x)*8) + 5)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 5)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 5)/16)))) && ((2 - (((threadIdx.x*8) + 5) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 5) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 5)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 5)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 5) % 16)) + -450)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 6)/128)*16) + threadIdx.x)*8) + 6)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 6)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 6)/16)))) && ((2 - (((threadIdx.x*8) + 6) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 6) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 6)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 6)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 6) % 16)) + -450)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 7)/128)*16) + threadIdx.x)*8) + 7)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 7)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 7)/16)))) && ((2 - (((threadIdx.x*8) + 7) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (226 - (((threadIdx.x*8) + 7) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 7)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 7)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 7) % 16)) + -450)], 0.000000f)
}
produce placeholder.shared {
if (likely(((blockIdx.z*2) < (10 - threadIdx.x)))) {
}
}
}
pad_temp.shared[(((((((threadIdx.x*8) + 1)/128)*16) + threadIdx.x)*8) + 1)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 1)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 1)/16)))) && ((1 - (((threadIdx.x*8) + 1) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 1) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 1)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 1)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 1) % 16)) + -449)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 2)/128)*16) + threadIdx.x)*8) + 2)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 2)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 2)/16)))) && ((1 - (((threadIdx.x*8) + 2) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 2) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 2)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 2)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 2) % 16)) + -449)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 3)/128)*16) + threadIdx.x)*8) + 3)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 3)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 3)/16)))) && ((1 - (((threadIdx.x*8) + 3) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 3) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 3)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 3)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 3) % 16)) + -449)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 4)/128)*16) + threadIdx.x)*8) + 4)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 4)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 4)/16)))) && ((1 - (((threadIdx.x*8) + 4) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 4) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 4)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 4)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 4) % 16)) + -449)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 5)/128)*16) + threadIdx.x)*8) + 5)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 5)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 5)/16)))) && ((1 - (((threadIdx.x*8) + 5) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 5) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 5)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 5)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 5) % 16)) + -449)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 6)/128)*16) + threadIdx.x)*8) + 6)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 6)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 6)/16)))) && ((1 - (((threadIdx.x*8) + 6) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 6) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 6)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 6)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 6) % 16)) + -449)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 7)/128)*16) + threadIdx.x)*8) + 7)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 7)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 7)/16)))) && ((1 - (((threadIdx.x*8) + 7) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((threadIdx.x*8) + 7) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 7)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 7)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 7) % 16)) + -449)], 0.000000f)
}
produce placeholder.shared {
if (likely(((blockIdx.z*2) < (10 - threadIdx.x)))) {
}
}
}
pad_temp.shared[(((((((threadIdx.x*8) + 1)/128)*16) + threadIdx.x)*8) + 1)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 1)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 1)/16)))) && ((0 - (((threadIdx.x*8) + 1) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 1) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 1)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 1)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 1) % 16)) + -448)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 2)/128)*16) + threadIdx.x)*8) + 2)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 2)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 2)/16)))) && ((0 - (((threadIdx.x*8) + 2) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 2) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 2)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 2)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 2) % 16)) + -448)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 3)/128)*16) + threadIdx.x)*8) + 3)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 3)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 3)/16)))) && ((0 - (((threadIdx.x*8) + 3) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 3) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 3)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 3)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 3) % 16)) + -448)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 4)/128)*16) + threadIdx.x)*8) + 4)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 4)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 4)/16)))) && ((0 - (((threadIdx.x*8) + 4) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 4) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 4)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 4)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 4) % 16)) + -448)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 5)/128)*16) + threadIdx.x)*8) + 5)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 5)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 5)/16)))) && ((0 - (((threadIdx.x*8) + 5) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 5) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 5)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 5)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 5) % 16)) + -448)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 6)/128)*16) + threadIdx.x)*8) + 6)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 6)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 6)/16)))) && ((0 - (((threadIdx.x*8) + 6) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 6) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 6)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 6)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 6) % 16)) + -448)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 7)/128)*16) + threadIdx.x)*8) + 7)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 7)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 7)/16)))) && ((0 - (((threadIdx.x*8) + 7) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (224 - (((threadIdx.x*8) + 7) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 7)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 7)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 7) % 16)) + -448)], 0.000000f)
}
produce placeholder.shared {
if (likely(((blockIdx.z*2) < (10 - threadIdx.x)))) {
}
}
}
pad_temp.shared[(((((((threadIdx.x*8) + 1)/128)*16) + threadIdx.x)*8) + 1)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 1)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 1)/16)))) && ((-1 - (((threadIdx.x*8) + 1) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 1) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 1)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 1)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 1) % 16)) + -447)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 2)/128)*16) + threadIdx.x)*8) + 2)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 2)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 2)/16)))) && ((-1 - (((threadIdx.x*8) + 2) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 2) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 2)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 2)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 2) % 16)) + -447)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 3)/128)*16) + threadIdx.x)*8) + 3)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 3)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 3)/16)))) && ((-1 - (((threadIdx.x*8) + 3) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 3) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 3)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 3)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 3) % 16)) + -447)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 4)/128)*16) + threadIdx.x)*8) + 4)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 4)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 4)/16)))) && ((-1 - (((threadIdx.x*8) + 4) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 4) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 4)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 4)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 4) % 16)) + -447)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 5)/128)*16) + threadIdx.x)*8) + 5)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 5)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 5)/16)))) && ((-1 - (((threadIdx.x*8) + 5) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 5) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 5)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 5)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 5) % 16)) + -447)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 6)/128)*16) + threadIdx.x)*8) + 6)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 6)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 6)/16)))) && ((-1 - (((threadIdx.x*8) + 6) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 6) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 6)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 6)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 6) % 16)) + -447)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 7)/128)*16) + threadIdx.x)*8) + 7)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 7)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 7)/16)))) && ((-1 - (((threadIdx.x*8) + 7) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (223 - (((threadIdx.x*8) + 7) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 7)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 7)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 7) % 16)) + -447)], 0.000000f)
}
produce placeholder.shared {
if (likely(((blockIdx.z*2) < (10 - threadIdx.x)))) {
}
}
}
pad_temp.shared[(((((((threadIdx.x*8) + 1)/128)*16) + threadIdx.x)*8) + 1)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 1)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 1)/16)))) && ((-2 - (((threadIdx.x*8) + 1) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 1) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 1)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 1)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 1) % 16)) + -446)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 2)/128)*16) + threadIdx.x)*8) + 2)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 2)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 2)/16)))) && ((-2 - (((threadIdx.x*8) + 2) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 2) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 2)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 2)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 2) % 16)) + -446)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 3)/128)*16) + threadIdx.x)*8) + 3)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 3)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 3)/16)))) && ((-2 - (((threadIdx.x*8) + 3) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 3) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 3)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 3)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 3) % 16)) + -446)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 4)/128)*16) + threadIdx.x)*8) + 4)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 4)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 4)/16)))) && ((-2 - (((threadIdx.x*8) + 4) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 4) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 4)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 4)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 4) % 16)) + -446)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 5)/128)*16) + threadIdx.x)*8) + 5)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 5)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 5)/16)))) && ((-2 - (((threadIdx.x*8) + 5) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 5) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 5)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 5)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 5) % 16)) + -446)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 6)/128)*16) + threadIdx.x)*8) + 6)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 6)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 6)/16)))) && ((-2 - (((threadIdx.x*8) + 6) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 6) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 6)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 6)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 6) % 16)) + -446)], 0.000000f)
pad_temp.shared[(((((((threadIdx.x*8) + 7)/128)*16) + threadIdx.x)*8) + 7)] = tvm_if_then_else(((((((2 - ry.outer) - (((threadIdx.x*8) + 7)/16)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < ((226 - ry.outer) - (((threadIdx.x*8) + 7)/16)))) && ((-2 - (((threadIdx.x*8) + 7) % 16)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (222 - (((threadIdx.x*8) + 7) % 16)))), placeholder[(((((((((((((((threadIdx.x*8) + 7)/128)*3) + rc.outer)*28) + blockIdx.y)*8) + (((threadIdx.x*8) + 7)/16)) + ry.outer)*14) + blockIdx.x)*16) + (((threadIdx.x*8) + 7) % 16)) + -446)], 0.000000f)
}
produce placeholder.shared {
if (likely(((blockIdx.z*2) < (10 - threadIdx.x)))) {
}
}
}
}
}
}
compute[((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x)] = max(compute[0], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 224)] = max(compute[1], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 448)] = max(compute[2], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 672)] = max(compute[3], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 896)] = max(compute[4], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 1120)] = max(compute[5], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 1344)] = max(compute[6], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 1568)] = max(compute[7], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 50176)] = max(compute[8], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 50400)] = max(compute[9], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 50624)] = max(compute[10], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 50848)] = max(compute[11], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 51072)] = max(compute[12], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 51296)] = max(compute[13], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 51520)] = max(compute[14], 0.000000f)
compute[(((((((blockIdx.z*56) + blockIdx.y)*112) + blockIdx.x)*16) + threadIdx.x) + 51744)] = max(compute[15], 0.000000f)
}


Summary¶

In this tutorial, we have seen

• How to use TOPI API for common operations with numpy-style operators.
• How TOPI facilitates generic schedules and operator fusion for a context, to generate optimized kernel codes.

Total running time of the script: ( 0 minutes 1.093 seconds)

Gallery generated by Sphinx-Gallery