use anyhow::Result;
use half::f16;
use ndarray::{Array, IxDyn};
use ort::{
    ExecutionProvider, ExecutionProviderDispatch, Session, SessionBuilder, TensorElementType,
    TensorRTExecutionProvider, ValueType,
};

use crate::{home_dir, Device, MinOptMax, Options, CHECK_MARK, CROSS_MARK, SAFE_CROSS_MARK};

/// ONNXRuntime Backend
#[derive(Debug)]
pub struct OrtEngine {
    session: Session,
    device: Device,
    inputs_minoptmax: Vec<Vec<MinOptMax>>,
    inames: Vec<String>,
    ishapes: Vec<Vec<isize>>,
    idtypes: Vec<TensorElementType>,
    onames: Vec<String>,
    oshapes: Vec<Vec<isize>>,
    odtypes: Vec<TensorElementType>,
    profile: bool,
    num_dry_run: usize,
}

impl OrtEngine {
    pub fn dry_run(&self) -> Result<()> {
        if self.num_dry_run == 0 {
            println!("{SAFE_CROSS_MARK} No dry run count specified, skipping the dry run.");
            return Ok(());
        }
        // Build one all-ones dummy tensor per input, sized with each dimension's `opt` value.
        let mut xs: Vec<Array<f32, IxDyn>> = Vec::new();
        for i in self.inputs_minoptmax.iter() {
            let mut x: Vec<usize> = Vec::new();
            for i_ in i.iter() {
                x.push(i_.opt as usize);
            }
            let x: Array<f32, IxDyn> = Array::ones(x).into_dyn();
            xs.push(x);
        }
        for _ in 0..self.num_dry_run {
            self.run(xs.as_ref())?;
        }
        println!("{CHECK_MARK} Dry run x{}", self.num_dry_run);
        Ok(())
    }
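    /// Builds an [`OrtEngine`] from an [`Options`] config: it loads the ONNX model once to
    /// read input/output tensor metadata, resolves every input dimension through the
    /// `iXY` overrides into [`MinOptMax`] triples, registers an execution provider chosen
    /// from `config.device`, then creates the final optimized session.
    ///
    /// A minimal sketch, assuming an `Options` value configured elsewhere in the crate
    /// (its construction is not part of this file), hence marked `ignore`:
    ///
    /// ```ignore
    /// let engine = OrtEngine::new(&config)?;
    /// engine.dry_run()?; // warm up with `opt`-sized dummy inputs
    /// println!("batch: {:?}", engine.batch());
    /// ```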
    pub fn new(config: &Options) -> Result<Self> {
        ort::init().commit()?;
        let session = Session::builder()?.with_model_from_file(&config.onnx_path)?;

        // inputs
        let mut ishapes = Vec::new();
        let mut idtypes = Vec::new();
        let mut inames = Vec::new();
        for x in session.inputs.iter() {
            inames.push(x.name.to_owned());
            if let ValueType::Tensor { ty, dimensions } = &x.input_type {
                ishapes.push(dimensions.iter().map(|x| *x as isize).collect::<Vec<isize>>());
                idtypes.push(*ty);
            } else {
                ishapes.push(vec![-1_isize]);
                idtypes.push(ort::TensorElementType::Float32);
            }
        }

        // outputs
        let mut oshapes = Vec::new();
        let mut odtypes = Vec::new();
        let mut onames = Vec::new();
        for x in session.outputs.iter() {
            onames.push(x.name.to_owned());
            if let ValueType::Tensor { ty, dimensions } = &x.output_type {
                oshapes.push(dimensions.iter().map(|x| *x as isize).collect::<Vec<isize>>());
                odtypes.push(*ty);
            } else {
                oshapes.push(vec![-1_isize]);
                odtypes.push(ort::TensorElementType::Float32);
            }
        }

        // Resolve min/opt/max for every input dimension: dynamic dims (-1) must be
        // overridden by the matching `config.iXY` option, static dims fall back to themselves.
        let mut inputs_minoptmax: Vec<Vec<MinOptMax>> = Vec::new();
        for (i, dims) in ishapes.iter().enumerate() {
            let mut v_: Vec<MinOptMax> = Vec::new();
            for (ii, &x) in dims.iter().enumerate() {
                let x_default: MinOptMax = (ishapes[i][ii], ishapes[i][ii], ishapes[i][ii]).into();
                let x: MinOptMax = match (i, ii) {
                    (0, 0) => Self::_set_ixx(x, &config.i00, i, ii).unwrap_or(x_default),
                    (0, 1) => Self::_set_ixx(x, &config.i01, i, ii).unwrap_or(x_default),
                    (0, 2) => Self::_set_ixx(x, &config.i02, i, ii).unwrap_or(x_default),
                    (0, 3) => Self::_set_ixx(x, &config.i03, i, ii).unwrap_or(x_default),
                    (0, 4) => Self::_set_ixx(x, &config.i04, i, ii).unwrap_or(x_default),
                    (0, 5) => Self::_set_ixx(x, &config.i05, i, ii).unwrap_or(x_default),
                    (1, 0) => Self::_set_ixx(x, &config.i10, i, ii).unwrap_or(x_default),
                    (1, 1) => Self::_set_ixx(x, &config.i11, i, ii).unwrap_or(x_default),
                    (1, 2) => Self::_set_ixx(x, &config.i12, i, ii).unwrap_or(x_default),
                    (1, 3) => Self::_set_ixx(x, &config.i13, i, ii).unwrap_or(x_default),
                    (1, 4) => Self::_set_ixx(x, &config.i14, i, ii).unwrap_or(x_default),
                    (1, 5) => Self::_set_ixx(x, &config.i15, i, ii).unwrap_or(x_default),
                    (2, 0) => Self::_set_ixx(x, &config.i20, i, ii).unwrap_or(x_default),
                    (2, 1) => Self::_set_ixx(x, &config.i21, i, ii).unwrap_or(x_default),
                    (2, 2) => Self::_set_ixx(x, &config.i22, i, ii).unwrap_or(x_default),
                    (2, 3) => Self::_set_ixx(x, &config.i23, i, ii).unwrap_or(x_default),
                    (2, 4) => Self::_set_ixx(x, &config.i24, i, ii).unwrap_or(x_default),
                    (2, 5) => Self::_set_ixx(x, &config.i25, i, ii).unwrap_or(x_default),
                    (3, 0) => Self::_set_ixx(x, &config.i30, i, ii).unwrap_or(x_default),
                    (3, 1) => Self::_set_ixx(x, &config.i31, i, ii).unwrap_or(x_default),
                    (3, 2) => Self::_set_ixx(x, &config.i32_, i, ii).unwrap_or(x_default),
                    (3, 3) => Self::_set_ixx(x, &config.i33, i, ii).unwrap_or(x_default),
                    (3, 4) => Self::_set_ixx(x, &config.i34, i, ii).unwrap_or(x_default),
                    (3, 5) => Self::_set_ixx(x, &config.i35, i, ii).unwrap_or(x_default),
                    _ => todo!(),
                };
                v_.push(x);
            }
            inputs_minoptmax.push(v_);
        }

        // build again, this time registering the requested execution provider
        let builder = Session::builder()?;
        let device = config.device.to_owned();
        let _ep = match device {
            Device::Trt(device_id) => Self::build_trt(
                &inames,
                &inputs_minoptmax,
                &builder,
                device_id,
                config.trt_int8_enable,
                config.trt_fp16_enable,
                config.trt_engine_cache_enable,
            )?,
            Device::Cuda(device_id) => Self::build_cuda(&builder, device_id)?,
            Device::CoreML(_) => {
                let coreml = ort::CoreMLExecutionProvider::default()
                    .with_subgraphs()
                    // .with_ane_only()
                    .build();
                if coreml.is_available()? && coreml.register(&builder).is_ok() {
                    println!("{CHECK_MARK} Using CoreML");
                    coreml
                } else {
                    println!("{CROSS_MARK} CoreML initialization failed");
                    println!("{CHECK_MARK} Using CPU");
                    ort::CPUExecutionProvider::default().build()
                }
            }
            Device::Cpu(_) => {
                println!("{CHECK_MARK} Using CPU");
                ort::CPUExecutionProvider::default().build()
            } // _ => todo!(),
        };
        let session = builder
            .with_optimization_level(ort::GraphOptimizationLevel::Level3)?
            .with_model_from_file(&config.onnx_path)?;

        Ok(Self {
            session,
            device,
            inputs_minoptmax,
            inames,
            ishapes,
            idtypes,
            onames,
            oshapes,
            odtypes,
            profile: config.profile,
            num_dry_run: config.num_dry_run,
        })
    }

    fn build_trt(
        inames: &[String],
        inputs_minoptmax: &[Vec<MinOptMax>],
        builder: &SessionBuilder,
        device_id: usize,
        int8_enable: bool,
        fp16_enable: bool,
        engine_cache_enable: bool,
    ) -> Result<ExecutionProviderDispatch> {
        // auto-generate the "name:AxBxC,..." shape specs for the TensorRT optimization profile
        let mut spec_min = String::new();
        let mut spec_opt = String::new();
        let mut spec_max = String::new();
        for (i, name) in inames.iter().enumerate() {
            if i != 0 {
                spec_min.push(',');
                spec_opt.push(',');
                spec_max.push(',');
            }
            let mut s_min = format!("{}:", name);
            let mut s_opt = format!("{}:", name);
            let mut s_max = format!("{}:", name);
            for d in inputs_minoptmax[i].iter() {
                let min_ = &format!("{}x", d.min);
                let opt_ = &format!("{}x", d.opt);
                let max_ = &format!("{}x", d.max);
                s_min += min_;
                s_opt += opt_;
                s_max += max_;
            }
            // drop the trailing 'x'
            s_min.pop();
            s_opt.pop();
            s_max.pop();
            spec_min += &s_min;
            spec_opt += &s_opt;
            spec_max += &s_max;
        }

        let trt = TensorRTExecutionProvider::default()
            .with_device_id(device_id as i32)
            .with_int8(int8_enable)
            .with_fp16(fp16_enable)
            .with_engine_cache(engine_cache_enable)
            .with_engine_cache_path(format!(
                "{}/{}",
                home_dir(None).to_str().unwrap(),
                "trt-cache"
            ))
            .with_timing_cache(false)
            .with_profile_min_shapes(spec_min)
            .with_profile_opt_shapes(spec_opt)
            .with_profile_max_shapes(spec_max)
            .build();
        if trt.is_available()? && trt.register(builder).is_ok() {
            println!("{CHECK_MARK} Using TensorRT (Initial model serialization may require a wait)");
            Ok(trt)
        } else {
            println!("{CROSS_MARK} TensorRT initialization failed. Trying CUDA...");
            Self::build_cuda(builder, device_id)
        }
    }

    fn build_cuda(builder: &SessionBuilder, device_id: usize) -> Result<ExecutionProviderDispatch> {
        let cuda = ort::CUDAExecutionProvider::default()
            .with_device_id(device_id as i32)
            .build();
        if cuda.is_available()? && cuda.register(builder).is_ok() {
            println!("{CHECK_MARK} Using CUDA");
            Ok(cuda)
        } else {
            println!("{CROSS_MARK} CUDA initialization failed");
            println!("{CHECK_MARK} Using CPU");
            Ok(ort::CPUExecutionProvider::default().build())
        }
    }
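    /// Runs inference. Each `f32` input array is converted to the element type the model
    /// declares for that input, the session is executed, and every output tensor is
    /// converted back to `f32`.
    ///
    /// A minimal sketch, assuming `engine` was built with [`OrtEngine::new`] and the model
    /// takes a single `1x3x640x640` image-like input (the real shape comes from the model
    /// and the `Options` overrides), hence marked `ignore`:
    ///
    /// ```ignore
    /// let x = ndarray::Array::<f32, _>::zeros((1, 3, 640, 640)).into_dyn();
    /// let ys = engine.run(&[x])?; // Vec<Array<f32, IxDyn>>
    /// println!("first output shape: {:?}", ys[0].shape());
    /// ```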
    pub fn run(&self, xs: &[Array<f32, IxDyn>]) -> Result<Vec<Array<f32, IxDyn>>> {
        // input: convert each f32 array to the dtype the model expects
        let mut xs_ = Vec::new();
        let t_pre = std::time::Instant::now();
        for (idtype, x) in self.idtypes.iter().zip(xs.iter()) {
            let x_ = match idtype {
                TensorElementType::Float32 => ort::Value::from_array(x.view())?,
                TensorElementType::Float16 => ort::Value::from_array(x.mapv(f16::from_f32).view())?,
                TensorElementType::Int32 => ort::Value::from_array(x.mapv(|x_| x_ as i32).view())?,
                TensorElementType::Int64 => ort::Value::from_array(x.mapv(|x_| x_ as i64).view())?,
                _ => todo!(),
            };
            xs_.push(x_);
        }
        let t_pre = t_pre.elapsed();

        // inference
        let t_run = std::time::Instant::now();
        let ys = self.session.run(xs_.as_ref())?;
        let t_run = t_run.elapsed();

        // output: convert every output tensor back to f32
        let mut ys_ = Vec::new();
        let t_post = std::time::Instant::now();
        for (dtype, name) in self.odtypes.iter().zip(self.onames.iter()) {
            let y = &ys[name.as_str()];
            let y_ = match &dtype {
                TensorElementType::Float32 => y.extract_tensor::<f32>()?.view().to_owned(),
                TensorElementType::Float16 => y.extract_tensor::<f16>()?.view().mapv(f16::to_f32),
                TensorElementType::Int64 => y
                    .extract_tensor::<i64>()?
                    .view()
                    .to_owned()
                    .mapv(|x| x as f32),
                _ => todo!(),
            };
            ys_.push(y_);
        }
        let t_post = t_post.elapsed();

        if self.profile {
            println!(
                "[Profile] batch: {:?} => {:.4?} (i: {t_pre:.4?}, run: {t_run:.4?}, o: {t_post:.4?})",
                self.batch().opt,
                t_pre + t_run + t_post
            );
        }
        Ok(ys_)
    }

    pub fn _set_ixx(x: isize, ixx: &Option<MinOptMax>, i: usize, ii: usize) -> Option<MinOptMax> {
        match x {
            -1 => match ixx {
                None => panic!(
                    "{CROSS_MARK} Using dynamic shapes in inputs without specifying them: the {}-th input, the {}-th dimension.",
                    i + 1,
                    ii + 1
                ),
                Some(ixx) => Some(ixx.to_owned()), // customized
            },
            _ => Some((x, x, x).into()), // customized, but not dynamic
        }
    }

    pub fn oshapes(&self) -> &Vec<Vec<isize>> {
        &self.oshapes
    }

    pub fn onames(&self) -> &Vec<String> {
        &self.onames
    }

    pub fn odtypes(&self) -> &Vec<TensorElementType> {
        &self.odtypes
    }

    pub fn ishapes(&self) -> &Vec<Vec<isize>> {
        &self.ishapes
    }

    pub fn inames(&self) -> &Vec<String> {
        &self.inames
    }

    pub fn idtypes(&self) -> &Vec<TensorElementType> {
        &self.idtypes
    }

    pub fn device(&self) -> &Device {
        &self.device
    }

    pub fn inputs_minoptmax(&self) -> &Vec<Vec<MinOptMax>> {
        &self.inputs_minoptmax
    }

    pub fn batch(&self) -> &MinOptMax {
        &self.inputs_minoptmax[0][0]
    }

    pub fn height(&self) -> &MinOptMax {
        &self.inputs_minoptmax[0][2]
    }

    pub fn width(&self) -> &MinOptMax {
        &self.inputs_minoptmax[0][3]
    }

    pub fn is_batch_dyn(&self) -> bool {
        self.ishapes[0][0] == -1
    }

    pub fn try_fetch(&self, key: &str) -> Option<String> {
        match self.session.metadata() {
            Err(_) => None,
            Ok(metadata) => match metadata.custom(key) {
                Err(_) => None,
                Ok(value) => value,
            },
        }
    }

    pub fn session(&self) -> &Session {
        &self.session
    }

    pub fn version(&self) -> Option<String> {
        self.try_fetch("version")
    }
}
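// A hypothetical end-to-end sketch, not part of the public API: it assumes the caller
// already has an `Options` value (constructed elsewhere in the crate) that points at a
// real ONNX model, and it assumes a `1x3x640x640` input purely for illustration.
#[allow(dead_code)]
fn engine_usage_sketch(config: &Options) -> Result<()> {
    let engine = OrtEngine::new(config)?; // load model, resolve shapes, pick an EP
    engine.dry_run()?; // optional warm-up with `opt`-sized dummy inputs
    let x = Array::<f32, _>::ones((1, 3, 640, 640)).into_dyn(); // assumed input shape
    let ys = engine.run(&[x])?;
    println!("got {} output tensor(s)", ys.len());
    Ok(())
}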