use anyhow::Result;
use half::f16;
use ndarray::{Array, IxDyn};
use ort::{
    ExecutionProvider, ExecutionProviderDispatch, Session, SessionBuilder, TensorElementType,
    TensorRTExecutionProvider, ValueType,
};

use crate::{home_dir, Device, MinOptMax, Options, CHECK_MARK, CROSS_MARK, SAFE_CROSS_MARK};

/// ONNXRuntime Backend
#[derive(Debug)]
pub struct OrtEngine {
    session: Session,
    device: Device,
    inputs_minoptmax: Vec<Vec<MinOptMax>>,
    inames: Vec<String>,
    ishapes: Vec<Vec<isize>>,
    idtypes: Vec<TensorElementType>,
    onames: Vec<String>,
    oshapes: Vec<Vec<isize>>,
    odtypes: Vec<TensorElementType>,
    profile: bool,
    num_dry_run: usize,
}

impl OrtEngine {
    pub fn dry_run(&self) -> Result<()> {
        if self.num_dry_run == 0 {
            println!("{SAFE_CROSS_MARK} No dry run count specified, skipping the dry run.");
            return Ok(());
        }
        // Build one all-ones dummy tensor per input, sized with each dimension's `opt` value.
        let mut xs: Vec<Array<f32, IxDyn>> = Vec::new();
        for i in self.inputs_minoptmax.iter() {
            let mut x: Vec<usize> = Vec::new();
            for i_ in i.iter() {
                x.push(i_.opt as usize);
            }
            let x: Array<f32, IxDyn> = Array::ones(x).into_dyn();
            xs.push(x);
        }
        for _ in 0..self.num_dry_run {
            self.run(xs.as_ref())?;
        }
        println!("{CHECK_MARK} Dry run x{}", self.num_dry_run);
        Ok(())
    }
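    /// Builds an [`OrtEngine`] from an [`Options`] config: it loads the ONNX model once to
    /// read input/output tensor metadata, resolves every input dimension through the
    /// `iXY` overrides into [`MinOptMax`] triples, registers an execution provider chosen
    /// from `config.device`, then creates the final optimized session.
    ///
    /// A minimal sketch, assuming an `Options` value configured elsewhere in the crate
    /// (its construction is not part of this file), hence marked `ignore`:
    ///
    /// ```ignore
    /// let engine = OrtEngine::new(&config)?;
    /// engine.dry_run()?; // warm up with `opt`-sized dummy inputs
    /// println!("batch: {:?}", engine.batch());
    /// ```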
    pub fn new(config: &Options) -> Result<Self> {
        ort::init().commit()?;
        let session = Session::builder()?.with_model_from_file(&config.onnx_path)?;

        // inputs
        let mut ishapes = Vec::new();
        let mut idtypes = Vec::new();
        let mut inames = Vec::new();
        for x in session.inputs.iter() {
            inames.push(x.name.to_owned());
            if let ValueType::Tensor { ty, dimensions } = &x.input_type {
                ishapes.push(dimensions.iter().map(|x| *x as isize).collect::<Vec<isize>>());
                idtypes.push(*ty);
            } else {
                ishapes.push(vec![-1_isize]);
                idtypes.push(ort::TensorElementType::Float32);
            }
        }

        // outputs
        let mut oshapes = Vec::new();
        let mut odtypes = Vec::new();
        let mut onames = Vec::new();
        for x in session.outputs.iter() {
            onames.push(x.name.to_owned());
            if let ValueType::Tensor { ty, dimensions } = &x.output_type {
                oshapes.push(dimensions.iter().map(|x| *x as isize).collect::<Vec<isize>>());
                odtypes.push(*ty);
            } else {
                oshapes.push(vec![-1_isize]);
                odtypes.push(ort::TensorElementType::Float32);
            }
        }

        // Resolve min/opt/max for every input dimension: dynamic dims (-1) must be
        // overridden by the matching `config.iXY` option, static dims fall back to themselves.
        let mut inputs_minoptmax: Vec<Vec<MinOptMax>> = Vec::new();
        for (i, dims) in ishapes.iter().enumerate() {
            let mut v_: Vec<MinOptMax> = Vec::new();
            for (ii, &x) in dims.iter().enumerate() {
                let x_default: MinOptMax = (ishapes[i][ii], ishapes[i][ii], ishapes[i][ii]).into();
                let x: MinOptMax = match (i, ii) {
                    (0, 0) => Self::_set_ixx(x, &config.i00, i, ii).unwrap_or(x_default),
                    (0, 1) => Self::_set_ixx(x, &config.i01, i, ii).unwrap_or(x_default),
                    (0, 2) => Self::_set_ixx(x, &config.i02, i, ii).unwrap_or(x_default),
                    (0, 3) => Self::_set_ixx(x, &config.i03, i, ii).unwrap_or(x_default),
                    (0, 4) => Self::_set_ixx(x, &config.i04, i, ii).unwrap_or(x_default),
                    (0, 5) => Self::_set_ixx(x, &config.i05, i, ii).unwrap_or(x_default),
                    (1, 0) => Self::_set_ixx(x, &config.i10, i, ii).unwrap_or(x_default),
                    (1, 1) => Self::_set_ixx(x, &config.i11, i, ii).unwrap_or(x_default),
                    (1, 2) => Self::_set_ixx(x, &config.i12, i, ii).unwrap_or(x_default),
                    (1, 3) => Self::_set_ixx(x, &config.i13, i, ii).unwrap_or(x_default),
                    (1, 4) => Self::_set_ixx(x, &config.i14, i, ii).unwrap_or(x_default),
                    (1, 5) => Self::_set_ixx(x, &config.i15, i, ii).unwrap_or(x_default),
                    (2, 0) => Self::_set_ixx(x, &config.i20, i, ii).unwrap_or(x_default),
                    (2, 1) => Self::_set_ixx(x, &config.i21, i, ii).unwrap_or(x_default),
                    (2, 2) => Self::_set_ixx(x, &config.i22, i, ii).unwrap_or(x_default),
                    (2, 3) => Self::_set_ixx(x, &config.i23, i, ii).unwrap_or(x_default),
                    (2, 4) => Self::_set_ixx(x, &config.i24, i, ii).unwrap_or(x_default),
                    (2, 5) => Self::_set_ixx(x, &config.i25, i, ii).unwrap_or(x_default),
                    (3, 0) => Self::_set_ixx(x, &config.i30, i, ii).unwrap_or(x_default),
                    (3, 1) => Self::_set_ixx(x, &config.i31, i, ii).unwrap_or(x_default),
                    (3, 2) => Self::_set_ixx(x, &config.i32_, i, ii).unwrap_or(x_default),
                    (3, 3) => Self::_set_ixx(x, &config.i33, i, ii).unwrap_or(x_default),
                    (3, 4) => Self::_set_ixx(x, &config.i34, i, ii).unwrap_or(x_default),
                    (3, 5) => Self::_set_ixx(x, &config.i35, i, ii).unwrap_or(x_default),
                    _ => todo!(),
                };
                v_.push(x);
            }
            inputs_minoptmax.push(v_);
        }

        // build again, this time registering the requested execution provider
        let builder = Session::builder()?;
        let device = config.device.to_owned();
        let _ep = match device {
            Device::Trt(device_id) => Self::build_trt(
                &inames,
                &inputs_minoptmax,
                &builder,
                device_id,
                config.trt_int8_enable,
                config.trt_fp16_enable,
                config.trt_engine_cache_enable,
            )?,
            Device::Cuda(device_id) => Self::build_cuda(&builder, device_id)?,
            Device::CoreML(_) => {
                let coreml = ort::CoreMLExecutionProvider::default()
                    .with_subgraphs()
                    // .with_ane_only()
                    .build();
                if coreml.is_available()? && coreml.register(&builder).is_ok() {
                    println!("{CHECK_MARK} Using CoreML");
                    coreml
                } else {
                    println!("{CROSS_MARK} CoreML initialization failed");
                    println!("{CHECK_MARK} Using CPU");
                    ort::CPUExecutionProvider::default().build()
                }
            }
            Device::Cpu(_) => {
                println!("{CHECK_MARK} Using CPU");
                ort::CPUExecutionProvider::default().build()
            } // _ => todo!(),
        };
        let session = builder
            .with_optimization_level(ort::GraphOptimizationLevel::Level3)?
            .with_model_from_file(&config.onnx_path)?;

        Ok(Self {
            session,
            device,
            inputs_minoptmax,
            inames,
            ishapes,
            idtypes,
            onames,
            oshapes,
            odtypes,
            profile: config.profile,
            num_dry_run: config.num_dry_run,
        })
    }

    fn build_trt(
        inames: &[String],
        inputs_minoptmax: &[Vec<MinOptMax>],
        builder: &SessionBuilder,
        device_id: usize,
        int8_enable: bool,
        fp16_enable: bool,
        engine_cache_enable: bool,
    ) -> Result<ExecutionProviderDispatch> {
        // auto-generate the "name:AxBxC,..." shape specs for the TensorRT optimization profile
        let mut spec_min = String::new();
        let mut spec_opt = String::new();
        let mut spec_max = String::new();
        for (i, name) in inames.iter().enumerate() {
            if i != 0 {
                spec_min.push(',');
                spec_opt.push(',');
                spec_max.push(',');
            }
            let mut s_min = format!("{}:", name);
            let mut s_opt = format!("{}:", name);
            let mut s_max = format!("{}:", name);
            for d in inputs_minoptmax[i].iter() {
                let min_ = &format!("{}x", d.min);
                let opt_ = &format!("{}x", d.opt);
                let max_ = &format!("{}x", d.max);
                s_min += min_;
                s_opt += opt_;
                s_max += max_;
            }
            // drop the trailing 'x'
            s_min.pop();
            s_opt.pop();
            s_max.pop();
            spec_min += &s_min;
            spec_opt += &s_opt;
            spec_max += &s_max;
        }

        let trt = TensorRTExecutionProvider::default()
            .with_device_id(device_id as i32)
            .with_int8(int8_enable)
            .with_fp16(fp16_enable)
            .with_engine_cache(engine_cache_enable)
            .with_engine_cache_path(format!(
                "{}/{}",
                home_dir(None).to_str().unwrap(),
                "trt-cache"
            ))
            .with_timing_cache(false)
            .with_profile_min_shapes(spec_min)
            .with_profile_opt_shapes(spec_opt)
            .with_profile_max_shapes(spec_max)
            .build();
        if trt.is_available()? && trt.register(builder).is_ok() {
            println!("{CHECK_MARK} Using TensorRT (Initial model serialization may require a wait)");
            Ok(trt)
        } else {
            println!("{CROSS_MARK} TensorRT initialization failed. Trying CUDA...");
            Self::build_cuda(builder, device_id)
        }
    }

    fn build_cuda(builder: &SessionBuilder, device_id: usize) -> Result<ExecutionProviderDispatch> {
        let cuda = ort::CUDAExecutionProvider::default()
            .with_device_id(device_id as i32)
            .build();
        if cuda.is_available()? && cuda.register(builder).is_ok() {
            println!("{CHECK_MARK} Using CUDA");
            Ok(cuda)
        } else {
            println!("{CROSS_MARK} CUDA initialization failed");
            println!("{CHECK_MARK} Using CPU");
            Ok(ort::CPUExecutionProvider::default().build())
        }
    }
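    /// Runs inference. Each `f32` input array is converted to the element type the model
    /// declares for that input, the session is executed, and every output tensor is
    /// converted back to `f32`.
    ///
    /// A minimal sketch, assuming `engine` was built with [`OrtEngine::new`] and the model
    /// takes a single `1x3x640x640` image-like input (the real shape comes from the model
    /// and the `Options` overrides), hence marked `ignore`:
    ///
    /// ```ignore
    /// let x = ndarray::Array::<f32, _>::zeros((1, 3, 640, 640)).into_dyn();
    /// let ys = engine.run(&[x])?; // Vec<Array<f32, IxDyn>>
    /// println!("first output shape: {:?}", ys[0].shape());
    /// ```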
    pub fn run(&self, xs: &[Array<f32, IxDyn>]) -> Result<Vec<Array<f32, IxDyn>>> {
        // input: convert each f32 array to the dtype the model expects
        let mut xs_ = Vec::new();
        let t_pre = std::time::Instant::now();
        for (idtype, x) in self.idtypes.iter().zip(xs.iter()) {
            let x_ = match idtype {
                TensorElementType::Float32 => ort::Value::from_array(x.view())?,
                TensorElementType::Float16 => ort::Value::from_array(x.mapv(f16::from_f32).view())?,
                TensorElementType::Int32 => ort::Value::from_array(x.mapv(|x_| x_ as i32).view())?,
                TensorElementType::Int64 => ort::Value::from_array(x.mapv(|x_| x_ as i64).view())?,
                _ => todo!(),
            };
            xs_.push(x_);
        }
        let t_pre = t_pre.elapsed();

        // inference
        let t_run = std::time::Instant::now();
        let ys = self.session.run(xs_.as_ref())?;
        let t_run = t_run.elapsed();

        // output: convert every output tensor back to f32
        let mut ys_ = Vec::new();
        let t_post = std::time::Instant::now();
        for (dtype, name) in self.odtypes.iter().zip(self.onames.iter()) {
            let y = &ys[name.as_str()];
            let y_ = match &dtype {
                TensorElementType::Float32 => y.extract_tensor::<f32>()?.view().to_owned(),
                TensorElementType::Float16 => y.extract_tensor::<f16>()?.view().mapv(f16::to_f32),
                TensorElementType::Int64 => y
                    .extract_tensor::<i64>()?
                    .view()
                    .to_owned()
                    .mapv(|x| x as f32),
                _ => todo!(),
            };
            ys_.push(y_);
        }
        let t_post = t_post.elapsed();

        if self.profile {
            println!(
                "[Profile] batch: {:?} => {:.4?} (i: {t_pre:.4?}, run: {t_run:.4?}, o: {t_post:.4?})",
                self.batch().opt,
                t_pre + t_run + t_post
            );
        }
        Ok(ys_)
    }

    pub fn _set_ixx(x: isize, ixx: &Option<MinOptMax>, i: usize, ii: usize) -> Option<MinOptMax> {
        match x {
            -1 => match ixx {
                None => panic!(
                    "{CROSS_MARK} Using dynamic shapes in inputs without specifying them: the {}-th input, the {}-th dimension.",
                    i + 1,
                    ii + 1
                ),
                Some(ixx) => Some(ixx.to_owned()), // customized
            },
            _ => Some((x, x, x).into()), // customized, but not dynamic
        }
    }

    pub fn oshapes(&self) -> &Vec<Vec<isize>> {
        &self.oshapes
    }

    pub fn onames(&self) -> &Vec<String> {
        &self.onames
    }

    pub fn odtypes(&self) -> &Vec<TensorElementType> {
        &self.odtypes
    }

    pub fn ishapes(&self) -> &Vec<Vec<isize>> {
        &self.ishapes
    }

    pub fn inames(&self) -> &Vec<String> {
        &self.inames
    }

    pub fn idtypes(&self) -> &Vec<TensorElementType> {
        &self.idtypes
    }

    pub fn device(&self) -> &Device {
        &self.device
    }

    pub fn inputs_minoptmax(&self) -> &Vec<Vec<MinOptMax>> {
        &self.inputs_minoptmax
    }

    pub fn batch(&self) -> &MinOptMax {
        &self.inputs_minoptmax[0][0]
    }

    pub fn height(&self) -> &MinOptMax {
        &self.inputs_minoptmax[0][2]
    }

    pub fn width(&self) -> &MinOptMax {
        &self.inputs_minoptmax[0][3]
    }

    pub fn is_batch_dyn(&self) -> bool {
        self.ishapes[0][0] == -1
    }

    pub fn try_fetch(&self, key: &str) -> Option<String> {
        match self.session.metadata() {
            Err(_) => None,
            Ok(metadata) => match metadata.custom(key) {
                Err(_) => None,
                Ok(value) => value,
            },
        }
    }

    pub fn session(&self) -> &Session {
        &self.session
    }

    pub fn version(&self) -> Option<String> {
        self.try_fetch("version")
    }
}
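// A hypothetical end-to-end sketch, not part of the public API: it assumes the caller
// already has an `Options` value (constructed elsewhere in the crate) that points at a
// real ONNX model, and it assumes a `1x3x640x640` input purely for illustration.
#[allow(dead_code)]
fn engine_usage_sketch(config: &Options) -> Result<()> {
    let engine = OrtEngine::new(config)?; // load model, resolve shapes, pick an EP
    engine.dry_run()?; // optional warm-up with `opt`-sized dummy inputs
    let x = Array::<f32, _>::ones((1, 3, 640, 640)).into_dyn(); // assumed input shape
    let ys = engine.run(&[x])?;
    println!("got {} output tensor(s)", ys.len());
    Ok(())
}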