First Experience with TVM
TVM official website: https://tvm.apache.org/
I heard about TVM by chance but had no idea where to start learning it, so this post records my first hands-on experiment.
Environment
OS: Ubuntu 16.04
Architecture: x86
cpu1: Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz (laptop)
cpu2: Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz (server)
Building and Installing
Download the source code
git clone --recursive https://github.com/apache/tvm tvm
# only needed if the clone above was not done with --recursive:
git submodule init
git submodule update
Install the base dependencies
sudo apt-get update
sudo apt-get install -y python3 python3-dev python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev
Build requirements (you can check your toolchain versions as shown below):
- a C++ compiler with C++14 support (g++ >= 5)
- CMake >= 3.5
- CUDA >= 8.0, if CUDA support is needed
- a recent version of LLVM is recommended
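Quick version checks:

g++ --version    # C++14 needs g++ >= 5
cmake --version  # needs >= 3.5
nvcc --version   # only if building with CUDA support; needs >= 8.0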
Installing LLVM
Download a suitable prebuilt release from https://github.com/llvm/llvm-project/releases.
For this experiment I downloaded LLVM 11.0 and extracted the archive.
Building TVM
cd tvm
mkdir build
cp cmake/config.cmake build
Open build/config.cmake (the copy made above) and edit the configuration.
This experiment targets x86 CPUs only, so the only change needed is the LLVM setting:
point USE_LLVM at the llvm-config binary inside the LLVM directory you just extracted.
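For example, assuming the archive was extracted to /opt/llvm-11 (an illustrative path), the relevant line in build/config.cmake becomes:

set(USE_LLVM /opt/llvm-11/bin/llvm-config)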
cd build
cmake ..
make -j8
Wait for the build to finish.
Installing the Python package
cd ../python
python3 setup.py install
TVM is now built and installed.
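A quick sanity check that the package is importable:

python3 -c "import tvm; print(tvm.__version__)"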
Model Testing
This experiment uses the resnet18 model from torchvision.
Testing the torch model
import time
import torch
import torchvision
model = torchvision.models.resnet18(pretrained=False)
example = torch.rand(1, 3, 224, 224)
pre_dict = torch.load('./resnet18-5c106cde.pth', map_location='cpu')
model.load_state_dict(pre_dict, strict=True)
with torch.no_grad():
    model.eval()
    since = time.time()
    for i in range(1000):
        out = model(example)
    time_elapsed = time.time() - since
    print('Time elapsed is {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
Converting PyTorch to ONNX
model = torchvision.models.resnet18(pretrained=False)
example = torch.rand(1, 3, 224, 224)  # dummy input
pre_dict = torch.load('./resnet18-5c106cde.pth', map_location='cpu')
model.load_state_dict(pre_dict, strict=True)
torch.onnx.export(model,  # torch.onnx.export returns None; it writes the file directly
                  example,
                  "resnet18.onnx",
                  verbose=True,
                  export_params=True)  # export with weights
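Before moving on, it is worth validating the exported file. onnx ships a model checker for this (the same call appears commented out in a later section):

import onnx

onnx_model = onnx.load("resnet18.onnx")
onnx.checker.check_model(onnx_model)  # raises an exception if the graph is malformed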
Testing the ONNX model
import time

import numpy as np
import onnxruntime
from PIL import Image
mean = [123., 117., 104.]  # mean and std of the ImageNet training set
std = [58.395, 57.12, 57.375]
def transform_image(image):  # convert a PIL image to an NCHW float32 NumPy array
    image = image - np.array(mean)  # the subtraction coerces the PIL image to an ndarray
    image /= np.array(std)
    image = image.transpose((2, 0, 1))
    image = image[np.newaxis, :].astype('float32')
    return image
img = Image.open('./air.jpg').resize((224, 224))  # resize the image to the model's input size
x = transform_image(img)
ort_session = onnxruntime.InferenceSession("resnet18.onnx")
# compute ONNX Runtime output prediction
print(ort_session.get_inputs()[0].name)
ort_inputs = {ort_session.get_inputs()[0].name: x}
ort_outs = ort_session.run(None, ort_inputs)
print(np.argmax(ort_outs))
since = time.time()
for i in range(1000):
    ort_inputs = {ort_session.get_inputs()[0].name: x}
    ort_outs = ort_session.run(None, ort_inputs)
time_elapsed = time.time() - since
print('Time elapsed is {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))  # print elapsed time
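A small observation that makes these timings easier to read: with exactly 1000 iterations, the total elapsed seconds are numerically equal to the average milliseconds per inference, which is how the comparison table at the end of this post reports latency:

print('{:.1f} ms per inference'.format(time_elapsed))  # (seconds / 1000 iters) * 1000 ms/s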
Compiling the ONNX model with TVM
import time

import numpy as np
import onnx
from PIL import Image

import tvm
from tvm import relay
import tvm.contrib.graph_executor as runtime

onnx_model = onnx.load('resnet18.onnx')  # load the exported model
mean = [123., 117., 104.]  # mean and std of the ImageNet training set
std = [58.395, 57.12, 57.375]
def transform_image(image):  # convert a PIL image to an NCHW float32 NumPy array
    image = image - np.array(mean)
    image /= np.array(std)
    image = image.transpose((2, 0, 1))
    image = image[np.newaxis, :].astype('float32')
    return image
img = Image.open('./air.jpg').resize((224, 224))  # resize the image to the model's input size
x = transform_image(img)
target = 'llvm -mcpu=core-avx2'  # set -mcpu for your own CPU; plain 'llvm' also works
input_name = 'input.1'
shape_dict = {input_name: x.shape}
sym, params = relay.frontend.from_onnx(onnx_model, shape_dict)
# print("tvm...")
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build_module.build(sym, target=target, params=params)
# upload parameters to device
dev = tvm.cpu()
module = runtime.GraphModule(lib["default"](dev))
module.set_input(input_name, tvm.nd.array(x.astype("float32")))
module.run()
out = module.get_output(0, tvm.nd.empty(((1, 1000)), "float32"))
print(np.argmax(out.asnumpy()[0]))
since = time.time()
for i in range(1000):
    module.set_input(input_name, tvm.nd.array(x.astype("float32")))
    module.run()
    out = module.get_output(0, tvm.nd.empty(((1, 1000)), "float32"))
time_elapsed = time.time() - since
print('Time elapsed is {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))  # print elapsed time
Auto-tuning
The TVM compilation above used no tuning, so performance is mediocre. auto-tuning searches for good schedules for a specific device; this experiment searches on the local machine, but remote devices can be tuned the same way.
Official tutorial: https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_x86.html#sphx-glr-tutorials-autotvm-tune-relay-x86-py
import os
import numpy as np
import onnx
import tvm
from tvm import relay, autotvm
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
import tvm.contrib.graph_executor as runtime
# Replace "llvm" with the correct target of your CPU.
# For example, for AWS EC2 c5 instance with Intel Xeon
# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512".
# For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be
# "llvm -mcpu=core-avx2".
target = "llvm -mcpu=core-avx2" # 针对不同设备进行设置, 如果设置不对,将搜索不到有效Schedules
batch_size = 1 # 设置好后,编译后不能更改
dtype = "float32"
model_name = "resnet18"
log_file = "%s.log" % model_name
graph_opt_sch_file = "%s_graph_opt.log" % model_name
# Set the input name of the graph. For this ONNX model exported
# from PyTorch it is "input.1".
input_name = 'input.1'
# Set number of threads used for tuning based on the number of
# physical CPU cores on your machine.
num_threads = 4
os.environ["TVM_NUM_THREADS"] = str(num_threads)
tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",  # search strategy; "random", "ga" and "gridsearch" are also available
    "early_stopping": 600,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=4, repeat=3, min_repeat_ms=150, enable_cpu_cache_flush=True
        ),
    ),
}
# You can skip the implementation of this function for this tutorial.
def tune_kernels(tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log"):
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == "xgb" or tuner == "xgb-rank":
            tuner_obj = XGBTuner(task, loss_type="rank")
        elif tuner == "ga":
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(task)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning
        n_trial = len(task.config_space)
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(log_filename),
            ],
        )
# Use graph tuner to achieve graph level optimal schedules
# Set use_DP=False if it takes too long to finish.
def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True):
    target_op = [
        relay.op.get("nn.conv2d"),
    ]
    Tuner = DPTuner if use_DP else PBQPTuner
    executor = Tuner(graph, {input_name: dshape}, records, target_op, target)
    executor.benchmark_layout_transform(min_exec_num=2000)
    executor.run()
    executor.write_opt_sch2record_file(opt_sch_file)
def tune_and_evaluate(tuning_opt):
    # extract workloads from relay program
    print("Extract tasks...")
    onnx_model = onnx.load('resnet18.onnx')  # load the model
    shape_dict = {input_name: [batch_size, 3, 224, 224]}
    data_shape = (batch_size, 3, 224, 224)
    out_shape = (batch_size, 1000)
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
    tasks = autotvm.task.extract_from_program(
        mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),)
    )

    # run tuning tasks
    tune_kernels(tasks, **tuning_opt)
    tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # compile kernels with graph-level best records
    with autotvm.apply_graph_best(graph_opt_sch_file):
        print("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(mod, target=target, params=params)

        # upload parameters to device
        dev = tvm.cpu()
        data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
        module = runtime.GraphModule(lib["default"](dev))
        module.set_input(input_name, data_tvm)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print(
            "Mean inference time (std dev): %.2f ms (%.2f ms)"
            % (np.mean(prof_res), np.std(prof_res))
        )
if __name__ == '__main__':
    tune_and_evaluate(tuning_option)
After the search finishes, the best schedules are saved in the graph_opt_sch_file log.
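As a side note, if you skip the graph-level tuning step, the kernel-level records in log_file can be applied directly with autotvm.apply_history_best instead (a minimal sketch reusing the names from the script above):

# compile using kernel-level tuning records only
with autotvm.apply_history_best(log_file):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build_module.build(mod, target=target, params=params)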
Testing TVM after auto-tuning
Now test the model compiled with the auto-tuning results.
import time

import numpy as np
import onnx
from PIL import Image

import tvm
from tvm import relay, autotvm
import tvm.contrib.graph_executor as runtime

onnx_model = onnx.load('resnet18.onnx')  # load the model
# onnx.checker.check_model(onnx_model)
mean = [123., 117., 104.]  # mean and std of the ImageNet training set
std = [58.395, 57.12, 57.375]
def transform_image(image):  # convert a PIL image to an NCHW float32 NumPy array
    image = image - np.array(mean)
    image /= np.array(std)
    image = image.transpose((2, 0, 1))
    image = image[np.newaxis, :].astype('float32')
    return image
img = Image.open('./air.jpg').resize((224, 224))  # resize the image to the model's input size
x = transform_image(img)
target = "llvm -mcpu=core-avx2"
input_name = 'input.1'
shape_dict = {input_name: x.shape}
sym, params = relay.frontend.from_onnx(onnx_model, shape_dict)
dtype = 'float32'
graph_opt_sch_file = "./resnet18_graph_opt.log"
with autotvm.apply_graph_best(graph_opt_sch_file):
    print("Compile...")
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build_module.build(sym, target=target, params=params)
# upload parameters to device
dev = tvm.cpu(1)
module = runtime.GraphModule(lib["default"](dev))
module.set_input(input_name, tvm.nd.array(x.astype(dtype)))
module.run()
out = module.get_output(0)
print(np.argmax(out.asnumpy()[0]))
since = time.time()
for i in range(1000):
    module.set_input(input_name, tvm.nd.array(x.astype(dtype)))
    module.run()
    out = module.get_output(0)
time_elapsed = time.time() - since
print('Time elapsed is {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))  # print elapsed time
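Compiling with the tuned schedules takes a while, so it is convenient to export the compiled module once and reload it later instead of rebuilding every time; export_library and tvm.runtime.load_module are the standard TVM calls for this (the file name is just an example):

lib.export_library("resnet18_tuned.so")  # save the compiled module to disk
loaded_lib = tvm.runtime.load_module("resnet18_tuned.so")
module = runtime.GraphModule(loaded_lib["default"](dev))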
Speed comparison

| CPU  | onnx | torch | tvm (auto-tuned) | tvm   |
|------|------|-------|------------------|-------|
| cpu1 | 51ms | 80ms  | 150ms            | 150ms |
| cpu2 | 5ms  | 31ms  | 7ms              | 11ms  |
cpu1 is the laptop CPU with limited compute; lscpu shows it supports neither AVX-512 nor AVX2, so auto-tuning found no valid schedules there and the result is very slow.
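To see which vector extensions your own CPU reports before choosing an -mcpu value:

lscpu | grep -o 'avx[^ ]*' | sort -u  # list the AVX-family flags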
Reference
[1] https://tvm.apache.org/
[2] https://bbs.huaweicloud.com/blogs/224847
[3] https://zhuanlan.zhihu.com/p/159025396
[4] https://zhuanlan.zhihu.com/p/88369758