train-preservation

Saving models during training in structures supported for cloud deployment

Cloud deployment supports inference with models saved in the SavedModel format by the TensorFlow, Keras, and OneFlow frameworks. For PyTorch, inference is supported for built-in models saved in the .pth format.

Tensorflow

TF1

  • Save with the tf.saved_model.builder API
...
# After training finishes, create a builder to save the model
builder = tf.saved_model.builder.SavedModelBuilder(model_save_path)
# Record the model's input and output tensors in a signature
with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    ...
    # x is the input tensor of the classification model, y is its output tensor
    signature = tf.saved_model.predict_signature_def(inputs={'Input': x}, outputs={'probability': y})
    # signature_def_map expects a dict mapping a signature key to the SignatureDef
    builder.add_meta_graph_and_variables(sess=sess,
                                         tags=[tf.saved_model.tag_constants.SERVING],
                                         signature_def_map={'serving_default': signature})
...
builder.save()
...

Official documentation: https://www.tensorflow.org/api_docs/python/tf/compat/v1/saved_model/Builder
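
To sanity-check the export locally, here is a minimal loading sketch under the assumptions of the snippet above (model_save_path, the 'Input'/'probability' names, and the 'serving_default' signature key); some_batch is a placeholder for real input data:

import tensorflow as tf

with tf.compat.v1.Session(graph=tf.Graph()) as sess:
    # Restore the MetaGraph tagged for serving from the SavedModel directory
    meta_graph = tf.compat.v1.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], model_save_path)
    sig = meta_graph.signature_def['serving_default']
    input_name = sig.inputs['Input'].name
    output_name = sig.outputs['probability'].name
    # Feed and fetch tensors by name to run one prediction
    probs = sess.run(output_name, feed_dict={input_name: some_batch})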

TF2

  • Save with the tf.saved_model API
# Define and train the model
...
# Save the model and define its serving signature
class MyModule(tf.Module):
    def __init__(self, model):
        self.model = model

    @tf.function(input_signature=[tf.TensorSpec(shape=None, dtype=tf.float32)])
    def output(self, input):
        result = self.model(input)
        return {"probability": result}

# m is the trained model built in the elided code above
module = MyModule(m)
tf.saved_model.save(module, model_save_path, signatures=module.output)
...

Official documentation: https://www.tensorflow.org/api_docs/python/tf/saved_model/save
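
A minimal sketch of restoring the module and calling the exported signature; model_save_path is the directory used above, the keyword name input follows the parameter of MyModule.output, and the tensor passed in is only a placeholder:

import tensorflow as tf

# Restore the SavedModel and look up the signature registered by tf.saved_model.save
loaded = tf.saved_model.load(model_save_path)
infer = loaded.signatures["serving_default"]
# The returned dict uses the "probability" key defined in MyModule.output
result = infer(input=tf.constant([[0.1, 0.2, 0.3]], dtype=tf.float32))
print(result["probability"])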

Keras

  • Save with the model.save API
import tensorflow as tf

# Create a Keras model, using the built-in MobileNet as an example
model = tf.keras.applications.MobileNet()
# Save the model to model_save_path in the SavedModel format
model.save(model_save_path)

Official documentation: https://www.tensorflow.org/guide/keras/save_and_serialize
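
A minimal sketch of loading the saved model back, assuming model_save_path from the snippet above:

import tensorflow as tf

# Reload the Keras model from the SavedModel directory
restored = tf.keras.models.load_model(model_save_path)
restored.summary()

# Alternatively, inspect it as a plain SavedModel to confirm a serving signature exists
loaded = tf.saved_model.load(model_save_path)
print(list(loaded.signatures.keys()))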

Oneflow

We provide an example of saving a model in the SavedModel format from OneFlow training code. In addition to the training network TrainNet, the user needs to define an InferenceNet used for inference. The methods related to saving the SavedModel are make_infer_func and save_to_savedmodel. This demo is built on the SavedModelBuilderV2 API of the OneFlow Serving module, which is still under development.

import os
import math
import shutil

import oneflow as flow
import ofrecord_util
import config as configs
from util import Snapshot, Summary, InitNodes, Metric
from job_function_util import get_train_config, get_val_config
import resnet_model
import resnext_model
import vgg_model
import alexnet_model
import inception_model
import mobilenet_v2_model

parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
(C, H, W) = args.image_shape
epoch_size = math.ceil(args.num_examples / train_batch_size)
num_val_steps = int(args.num_val_examples / val_batch_size)

model_dict = {
    "resnet50": resnet_model.resnet50,
    "vgg": vgg_model.vgg16bn,
    "alexnet": alexnet_model.alexnet,
    "inceptionv3": inception_model.inceptionv3,
    "mobilenetv2": mobilenet_v2_model.Mobilenet,
    "resnext50": resnext_model.resnext50,
}

flow.config.gpu_device_num(args.gpu_num_per_node)


def label_smoothing(labels, classes, eta, dtype):
    assert classes > 0
    assert eta >= 0.0 and eta < 1.0
    return flow.one_hot(labels, depth=classes, dtype=dtype,
                        on_value=1 - eta + eta / classes, off_value=eta / classes)


@flow.global_function("train", get_train_config(args))
def TrainNet():
    if args.data_url:
        assert os.path.exists(args.data_url)
        print("Loading data from {}".format(args.data_url))
        (labels, images) = ofrecord_util.load_imagenet_for_training(args)
    else:
        print("Loading synthetic data.")
        (labels, images) = ofrecord_util.load_synthetic(args)
    logits = model_dict[args.model](images,
                                    need_transpose=False if args.data_url else True,
                                    )
    if args.label_smoothing > 0:
        one_hot_labels = label_smoothing(labels, args.num_classes, args.label_smoothing, logits.dtype)
        loss = flow.nn.softmax_cross_entropy_with_logits(one_hot_labels, logits, name="softmax_loss")
    else:
        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
    loss = flow.math.reduce_mean(loss)
    flow.losses.add_loss(loss)
    predictions = flow.nn.softmax(logits)
    outputs = {"loss": loss, "predictions": predictions, "labels": labels}
    return outputs


def make_infer_func(val_config):
    # Record the logical blob names of the inference job's input and output,
    # so they can be registered with the SavedModel builder later.
    input_lbns = {}
    output_lbns = {}

    @flow.global_function("predict", val_config)
    def InferenceNet(
        image: flow.typing.Numpy.Placeholder((1,) + tuple(args.image_shape), dtype=flow.float32)
    ) -> flow.typing.Numpy:
        input_lbns["image"] = image.logical_blob_name
        output = model_dict[args.model](image)
        output = flow.nn.softmax(output)
        output_lbns["output"] = output.logical_blob_name
        return output

    return InferenceNet, input_lbns, output_lbns


def save_to_savedmodel(checkpoint_path):
    resnet_infer, input_lbns, output_lbns = make_infer_func(get_val_config(args))
    # Load the trained weights from the checkpoint before exporting
    checkpoint = flow.train.CheckPoint()
    checkpoint.load(checkpoint_path)
    saved_model_path = os.path.join(args.train_out, args.model)
    if os.path.exists(saved_model_path) and os.path.isdir(saved_model_path):
        shutil.rmtree(saved_model_path)
    model_version = 1
    saved_model_builder = flow.SavedModelBuilderV2(saved_model_path)
    job_builder = (
        saved_model_builder.ModelName(args.model)
        .Version(model_version)
        .Job(resnet_infer)
    )
    # Register the inference job's inputs and outputs with the builder
    for input_name, lbn in input_lbns.items():
        job_builder.Input(input_name, lbn)
    for output_name, lbn in output_lbns.items():
        job_builder.Output(output_name, lbn)
    job_builder.Complete().Save()


def main():
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.train_log)
    summary = Summary(args.train_log, args)
    snapshot = Snapshot(args.train_out, args.model_load_dir)
    for epoch in range(args.num_epochs):
        metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
                        summary=summary, save_summary_steps=epoch_size,
                        batch_size=train_batch_size, loss_key='loss')
        for i in range(epoch_size):
            TrainNet().async_get(metric.metric_cb(epoch, i))
        snapshot.save('epoch_{}'.format(epoch))
    flow.clear_default_session()
    # Training is done: export the checkpoint of the last epoch to the SavedModel format
    checkpoint_path = os.path.join(args.train_out, 'snapshot_epoch_{}'.format(args.num_epochs - 1))
    save_to_savedmodel(checkpoint_path)


if __name__ == '__main__':
    main()

Pytorch

An example of saving a PyTorch model:

import torch
import torchvision.models as models

if __name__ == '__main__':
    # Build the network structure and load pretrained weights from a local .pth file
    model = models.resnext50_32x4d(pretrained=False)
    pre = torch.load("/usr/local/model/pytorch_models/resnext50_32x4d-7cdf4587.pth")
    model.load_state_dict(pre)
    # Save both the model object and its state_dict into one checkpoint file
    checkpoint = {'model': models.resnext50_32x4d(),
                  'state_dict': model.state_dict()}
    torch.save(checkpoint, '/usr/local/model/pytorch_models/resnext50.pth')
    print("success")