Deploy Single Shot Multibox Detector(SSD) model

Author: Yao Wang, Leyuan Wang

This article is an introductory tutorial on deploying SSD models with TVM. We will use an MXNet pre-trained SSD model with ResNet50 as the body network and convert it to an NNVM graph; the Relay frontend can be selected instead with the --frontend option.

import os
import zipfile
import tvm
import mxnet as mx
import cv2
import numpy as np

from nnvm import compiler
from nnvm.frontend import from_mxnet
from tvm import relay
# Download helper shipped with TVM; fetches a URL to a local path.
from tvm.contrib.download import download
from tvm.contrib import graph_runtime
from mxnet.model import load_checkpoint

Preliminary and Set parameters

We should build TVM with sort support. In the TVM root directory, run:

echo "set(USE_SORT ON)" > config.mk
make -j8
# Name of the pre-trained model; also used as the checkpoint prefix when the
# weights are loaded.
model_name = "ssd_resnet50_512"
# Local file name of the downloaded model archive (it is opened with zipfile
# after download, hence the .zip suffix).
model_file = "%s.zip" % model_name
# Local file name of the demo image.
test_image = "dog.jpg"
# Network input shape as (batch, channel, height, width): the preprocessing
# step transposes the image to channel-first and prepends a batch axis.
dshape = (1, 3, 512, 512)
# Element type the input tensor is cast to before it is handed to the runtime.
dtype = "float32"

# Target settings: `target` is the compilation target handed to the build
# step and `ctx` is the device context the graph runtime is created on.
# Use these commented settings to build for cuda.
#target = 'cuda'
#ctx = tvm.gpu(0)
# Use these commented settings to build for opencl.
#target = 'opencl'
#ctx = tvm.opencl(0)
# Default: compile with LLVM and run on the CPU.
target = "llvm"
ctx = tvm.cpu()

Download MXNet SSD pre-trained model and demo image

Pre-trained model available at https://github.com/apache/incubator-mxnet/tree/master/example/ssd

# NOTE(review): the URL literals were emptied when this page was extracted and
# the trailing backslashes fused the four assignments into one invalid
# statement. The values below are restored from the upstream release and gist
# locations -- confirm they still resolve before relying on them.

# Zip archive containing the pre-trained SSD ResNet50 checkpoint.
model_url = ("https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/"
             "resnet50_ssd_512_voc0712_trainval.zip")
# Demo image the detector is run on.
image_url = ("https://cloud.githubusercontent.com/assets/3307514/20012567/"
             "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg")
# Folder name produced when the gist archive is unpacked: "<gist id>-<commit>".
inference_symbol_folder = \
    "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26"
# Zip archive of the gist holding the inference-only symbol JSON.
inference_symbol_url = ("https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/"
                        "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip")

# Working directory for every downloaded artifact. The name shadows the
# builtin `dir`, but later statements in this script refer to it, so it is
# kept unchanged.
dir = "ssd_model"
if not os.path.exists(dir):
    os.makedirs(dir)
model_file_path = "%s/%s" % (dir, model_file)
test_image_path = "%s/%s" % (dir, test_image)
# The symbol archive needs a file name of its own; a bare "<dir>/" is not a
# valid download destination.
inference_symbol_path = "%s/inference_model.zip" % dir
download(model_url, model_file_path)
download(image_url, test_image_path)
download(inference_symbol_url, inference_symbol_path)

# Unpack both archives into the working directory. The context manager closes
# each archive even if extraction fails.
for archive_path in (model_file_path, inference_symbol_path):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(dir)


File ssd_model/ exists, skip.
File ssd_model/dog.jpg exists, skip.
File ssd_model/ exists, skip.

Convert and compile model with NNVM or Relay for CPU.

# Load the inference symbol from the JSON file inside the extracted symbol
# folder.
sym = mx.sym.load("%s/%s/ssd_resnet50_inference.json" % (dir, inference_symbol_folder))
# Load the checkpoint stored under the prefix "<dir>/<model_name>"; the second
# argument (0) is the epoch number per MXNet's load_checkpoint. The first
# returned value is discarded because `sym` above is used instead.
_, arg_params, aux_params = load_checkpoint("%s/%s" % (dir, model_name), 0)

import argparse

# Pick the compilation frontend on the command line; NNVM is the default and
# any value other than the two supported frontends is rejected by argparse.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-f", "--frontend",
    help="Frontend for compilation, nnvm or relay",
    type=str,
    choices=["nnvm", "relay"],
    default="nnvm")
args = parser.parse_args()
if args.frontend == "relay":
    # Relay takes the input shapes at conversion time.
    net, params = relay.frontend.from_mxnet(sym, {"data": dshape}, arg_params=arg_params,
                                            aux_params=aux_params)
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(net, target, params=params)
else:
    # NNVM takes the input shapes at build time instead.
    net, params = from_mxnet(sym, arg_params, aux_params)
    with compiler.build_config(opt_level=3):
        graph, lib, params = compiler.build(
            net, target, {"data": dshape}, params=params)

Create TVM runtime and do inference

# Preprocess image
image = cv2.imread(test_image_path)
if image is None:
    # cv2.imread reports failure by returning None rather than raising.
    raise IOError("Failed to read test image: %s" % test_image_path)
# cv2.resize expects dsize as (width, height), while dshape is laid out as
# (batch, channel, height, width).
img_data = cv2.resize(image, (dshape[3], dshape[2]))
# Reverse the channel order (OpenCV loads BGR) and switch to float so the
# subtraction below can be done in place.
img_data = img_data[:, :, (2, 1, 0)].astype(np.float32)
# Subtract per-channel offsets (presumably the training-set channel means --
# confirm against the model's training configuration).
img_data -= np.array([123, 117, 104])
# Height-width-channel -> channel-height-width, then prepend the batch axis.
img_data = np.transpose(np.array(img_data), (2, 0, 1))
img_data = np.expand_dims(img_data, axis=0)
# Build TVM runtime
m = graph_runtime.create(graph, lib, ctx)
m.set_input('data', tvm.nd.array(img_data.astype(dtype)))
# Load the compiled weights; without them the graph runs on unset parameters.
m.set_input(**params)
# execute
m.run()
# get outputs
tvm_output = m.get_output(0)

Display result

# Category labels; a detection's class id is used as an index into this list.
class_names = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]
def display(img, out, thresh=0.5):
    """Draw detection boxes and labels on top of an image and show the figure.

    Parameters
    ----------
    img : array
        Image to draw on; ``img.shape[0]`` and ``img.shape[1]`` are used as
        its height and width.
    out : iterable of array
        One row per detection. Element 0 is the class id, element 1 the
        score and elements 2..5 the box corners (xmin, ymin, xmax, ymax).
        The corners are multiplied by the image width/height, so they are
        expected to be relative coordinates.
    thresh : float
        Detections scoring below this value are not drawn.
    """
    import random
    import matplotlib as mpl
    import matplotlib.pyplot as plt
    mpl.rcParams['figure.figsize'] = (10, 10)
    pens = {}
    # Draw the image first so the boxes are rendered on top of it.
    plt.imshow(img)
    axes = plt.gca()
    # Width/height factors for (xmin, ymin, xmax, ymax); identical for every
    # detection, so computed once.
    scales = [img.shape[1], img.shape[0]] * 2
    for det in out:
        cid = int(det[0])
        if cid < 0:
            # A negative class id marks a row that holds no detection.
            continue
        score = det[1]
        if score < thresh:
            continue
        if cid not in pens:
            # One random colour per class, reused for all of its boxes.
            pens[cid] = (random.random(), random.random(), random.random())
        xmin, ymin, xmax, ymax = [int(p * s) for p, s in zip(det[2:6].tolist(), scales)]
        rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False,
                             edgecolor=pens[cid], linewidth=3)
        # The rectangle only becomes visible once it is attached to the axes.
        axes.add_patch(rect)
        text = class_names[cid]
        axes.text(xmin, ymin-2, '{:s} {:.3f}'.format(text, score),
                  bbox=dict(facecolor=pens[cid], alpha=0.5),
                  fontsize=12, color='white')
    plt.show()

# Convert the OpenCV image from BGR to RGB channel order before plotting.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Plot the detections of the first batch item; rows scoring below 0.45 are
# skipped by display().
display(image, tvm_output.asnumpy()[0], thresh=0.45)

Total running time of the script: ( 0 minutes 20.034 seconds)

Gallery generated by Sphinx-Gallery