#[derive(Debug, Clone, Ord, Eq, PartialOrd, PartialEq)] pub enum Task { Untitled, /// Image classification task. /// Input: image /// Output: a label representing the class of the image ImageClassification, /// Multi-label image tagging task. /// Input: image /// Output: multiple labels representing different categories in the image ImageTagging, /// Image captioning task, generating descriptions with different levels of detail. /// Input: image /// Output: a text description, `u8` represents the level of detail: /// 0 for brief, 1 for detailed, 2 for more detailed Caption(u8), /// Region proposal task, detecting all objects in the image. /// Input: image /// Output: bounding boxes (bboxes) RegionProposal, /// Object detection task, detecting all objects in the image. /// Input: image /// Output: bounding boxes (bboxes), class labels, and optional scores for the detected objects ObjectDetection, /// Open set detection task, detecting and classifying objects in an image, with the ability to handle unseen or unknown objects. /// Input: image /// Output: bounding boxes, class labels (including an "unknown" category for unfamiliar objects), and detection scores /// Open set detection task, with String query OpenSetDetection(String), /// Task for generating brief descriptions of dense regions in the image. /// Input: image /// Output: bounding boxes (bboxes), brief phrase labels, and optional scores for detected regions DenseRegionCaption, /// Keypoint detection task, detecting keypoints in an image. /// This can include human body parts (e.g., hands, feet, joints) or other objects. /// Input: image /// Output: coordinates of detected keypoints KeypointsDetection, /// Semantic segmentation task, segmenting the image into different semantic regions. /// Input: image /// Output: per-pixel class labels indicating object or background SemanticSegmentation, /// Instance segmentation task, detecting and segmenting individual object instances. /// Input: image /// Output: pixel masks for each object instance InstanceSegmentation, /// Depth estimation task, predicting the distance of each pixel from the camera. /// Input: image /// Output: a depth map where each pixel has a depth value DepthEstimation, /// Surface normal prediction task, predicting the surface normal vector for each pixel. /// Input: image /// Output: a normal map where each pixel has a surface normal vector SurfaceNormalPrediction, /// Image-to-image generation task, transforming one image into another. /// Input: image /// Output: a generated image ImageToImageGeneration, /// Text-to-image generation task, generating an image based on a text description. /// Input: text /// Output: a generated image TextToImageGeneration, /// Inpainting task, filling in missing or corrupted parts of an image. /// Input: image with missing or corrupted regions /// Output: a complete image with the missing parts filled in Inpainting, /// Super-resolution task, enhancing the resolution of an image. /// Input: low-resolution image /// Output: high-resolution image SuperResolution, /// Image denoising task, removing noise from an image. /// Input: noisy image /// Output: denoised image Denoising, /// Phrase grounding task, finding the region in an image corresponding to a text description. /// Input: image and text /// Output: image region and the corresponding phrase /// caption to phrase grounding CaptionToPhraseGrounding(String), /// Referring expression segmentation task, segmenting objects in the image based on a text description. /// Input: image and referring expression /// Output: a segmentation mask for the object referred to by the text ReferringExpressionSegmentation(String), /// Region-to-segmentation task, similar to combining object detection with segmentation (e.g., YOLO + SAM). /// Input: image and region proposals /// Output: segmentation masks for the regions /// Region, bbox: top-left, bottom-right RegionToSegmentation(usize, usize, usize, usize), /// Region-to-category classification task, classifying the object in a given region of the image. /// Input: image and region /// Output: class label for the region /// Region, bbox: top-left, bottom-right RegionToCategory(usize, usize, usize, usize), /// Region-to-description task, generating a detailed description for a given region in the image. /// Input: image and region /// Output: a detailed textual description for the region /// Region, bbox: top-left, bottom-right RegionToDescription(usize, usize, usize, usize), /// Visual question answering (VQA) task, answering questions related to an image. /// Input: image and question text /// Output: the answer to the question Vqa(String), /// Optical character recognition (OCR) task, recognizing text in an image. /// Input: image /// Output: recognized text Ocr, /// OCR task with region information, recognizing text and returning its location in the image. /// Input: image /// Output: recognized text and its bounding box in the image OcrWithRegion, } impl Task { pub fn prompt_for_florence2(&self) -> anyhow::Result { let prompt = match self { Self::Untitled => anyhow::bail!("No task specified."), Self::Caption(0) => "What does the image describe?".to_string(), Self::Caption(1) => "Describe in detail what is shown in the image.".to_string(), Self::Caption(2) => "Describe with a paragraph what is shown in the image.".to_string(), Self::Ocr => "What is the text in the image?".to_string(), Self::OcrWithRegion => "What is the text in the image, with regions?".to_string(), Self::ObjectDetection => { "Locate the objects with category name in the image.".to_string() } Self::DenseRegionCaption => { "Locate the objects in the image, with their descriptions.".to_string() } Self::RegionProposal => "Locate the region proposals in the image.".to_string(), Self::OpenSetDetection(text) => { format!("Locate {} in the image.", text) } Self::CaptionToPhraseGrounding(text) => { format!("Locate the phrases in the caption: {}", text) } Self::ReferringExpressionSegmentation(text) => { format!("Locate {} in the image with mask", text) } Self::RegionToSegmentation(x0, y0, x1, y1) => { format!( "What is the polygon mask of region ", x0, y0, x1, y1 ) } Self::RegionToCategory(x0, y0, x1, y1) => { format!( "What is the region ?", x0, y0, x1, y1 ) } Self::RegionToDescription(x0, y0, x1, y1) => { format!( "What does the region describe?", x0, y0, x1, y1 ) } _ => anyhow::bail!("Unsupported task."), }; Ok(prompt) } }