Skip to content

Python API Reference

Auto-generated API documentation for all Python modules in the python/modules/ package.


modules.model

The main high-level module for running RF-DETR inference.

Detection dataclass

Source code in python/modules/model.py
@dataclass
class Detection:
    """Single detection produced by RFDETRModel for one image."""

    # Confidence of the predicted class (highest sigmoid class probability).
    score: float
    # Index of the class with the highest probability.
    label: int
    # x, y are the top-left corner (converted from the model's cxcywh boxes).
    normalized_box: np.ndarray  # [x, y, w, h] normalized
    unnormalized_box: np.ndarray  # [x, y, w, h] in pixels
    # uint8 mask (0 or 255) resized to the original image size; None when the
    # model produces no mask output.
    mask: Optional[np.ndarray] = None

RFDETRModel

High-level class for RF-DETR model inference.

Source code in python/modules/model.py
class RFDETRModel:
    """High-level class for RF-DETR model inference.

    Owns an ``OnnxRuntimeSession`` and implements the pipeline around it:
    image preprocessing, decoding of raw outputs into ``Detection`` objects,
    per-stage timing, and drawing/saving of results.
    """

    # Per-channel mean and standard deviation, in RGB order, applied to pixel
    # values after they have been scaled to [0, 1].
    MEANS = [0.485, 0.456, 0.406]
    STDS = [0.229, 0.224, 0.225]

    def __init__(self, model_path: str, device: str = "gpu"):
        """
        Initialize the RF-DETR model.

        Args:
            model_path (str): Path to the ONNX model file.
            device (str): Device preference ("gpu" or "cpu").
        """
        self.ort_session_ = OnnxRuntimeSession(model_path, device=device)
        # Only the two trailing dimensions (height, width) of the input shape are used.
        input_shape = self.ort_session_.get_input_shape()
        self.input_height, self.input_width = input_shape[2:]

        # Pre-convert normalization constants for speed; the (3, 1, 1) shape
        # broadcasts over a CHW image.
        self.means = np.array(self.MEANS, dtype=np.float32).reshape(3, 1, 1)
        self.stds = np.array(self.STDS, dtype=np.float32).reshape(3, 1, 1)

    def _preprocess(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess the input image for inference.

        Args:
            image (np.ndarray): Input image (H, W, C) in BGR format.

        Returns:
            np.ndarray: Preprocessed float32 image batch (1, C, H, W).
        """
        # Convert BGR (OpenCV) to RGB, then resize to the model's input size
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (self.input_width, self.input_height))

        # Scale pixel values to [0, 1] as float32
        image = image.astype(np.float32) / 255.0

        # Change dimensions from HWC to CHW so the (3, 1, 1) constants broadcast
        image = np.transpose(image, (2, 0, 1))

        # Normalize (vectorized)
        image = (image - self.means) / self.stds

        # Add batch dimension; copy=False avoids a second copy when already float32
        return np.expand_dims(image.astype(np.float32, copy=False), axis=0)

    def _post_process(
        self, 
        outputs: list[np.ndarray], 
        origin_height: int, 
        origin_width: int, 
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD, 
        max_number_boxes: int = DEFAULT_MAX_NUMBER_BOXES
    ) -> list[Detection]:
        """
        Post-process the model's output to extract bounding boxes and class information.
        Inspired by the PostProcess class in rfdetr/lwdetr.py: https://github.com/roboflow/rf-detr/blob/1.3.0/rfdetr/models/lwdetr.py#L701

        Args:
            outputs (list[np.ndarray]): Raw model outputs: boxes (cxcywh,
                normalized), class logits, and optionally masks, each with a
                leading batch dimension of which only element 0 is used.
            origin_height (int): Original image height.
            origin_width (int): Original image width.
            confidence_threshold (float): Confidence threshold for filtering.
            max_number_boxes (int): Maximum number of boxes to return.

        Returns:
            list[Detection]: Detections sorted by descending score.
        """
        # Select the single batch element explicitly instead of squeeze(): squeeze
        # would also drop any other size-1 axis (e.g. a single query).
        # Outputs are cast to float32 so half-precision model outputs are not
        # scaled to pixels in float16 or passed to cv2.resize as float16
        # (assumed unsupported there; the cast is harmless either way).
        boxes = np.asarray(outputs[0][0], dtype=np.float32)
        logits = np.asarray(outputs[1][0], dtype=np.float32)
        # Masks are only present for instance segmentation models (third output)
        masks = np.asarray(outputs[2][0], dtype=np.float32) if len(outputs) == 3 else None

        # Apply sigmoid activation and keep the best class per query
        prob = sigmoid(logits)
        scores = prob.max(axis=1)
        labels = prob.argmax(axis=1)

        # Keep the max_number_boxes highest-scoring queries, then drop those
        # that do not exceed the confidence threshold
        keep = np.argsort(scores)[::-1][:max_number_boxes]
        keep = keep[scores[keep] > confidence_threshold]
        scores = scores[keep]
        labels = labels[keep]

        # Convert boxes from cxcywh to xywh format (normalized)
        norm_boxes = box_cxcywh_to_xywh(boxes[keep])

        # Scale x/w by the original width and y/h by the original height
        pixel_scale = np.array(
            [origin_width, origin_height, origin_width, origin_height],
            dtype=norm_boxes.dtype,
        )
        unnorm_boxes = norm_boxes * pixel_scale

        # Resize the masks to the original image size and binarize to 0/255
        if masks is None:
            processed_masks = [None] * len(keep)
        else:
            processed_masks = [
                (cv2.resize(m, (origin_width, origin_height)) > 0).astype(np.uint8) * 255
                for m in masks[keep]
            ]

        # Create list of Detection objects
        return [
            Detection(
                score=float(score),
                label=int(label),
                normalized_box=norm_box,
                unnormalized_box=unnorm_box,
                mask=mask,
            )
            for score, label, norm_box, unnorm_box, mask in zip(
                scores, labels, norm_boxes, unnorm_boxes, processed_masks
            )
        ]

    def predict(
        self, 
        image: Union[np.ndarray, Image.Image], 
        confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD, 
        max_number_boxes: int = DEFAULT_MAX_NUMBER_BOXES
    ) -> tuple[list[Detection], dict[str, float]]:
        """
        Predict bounding boxes and masks for a single image.

        Args:
            image: Input image (OpenCV format BGR or PIL Image).
            confidence_threshold: Confidence threshold for filtering boxes.
            max_number_boxes: Maximum number of boxes to return.

        Returns:
            A tuple of (detections, timings). ``timings`` maps "preprocess",
            "ort_run", "postprocess" and "total" to durations in milliseconds.
        """
        start_total = time.perf_counter()

        # 0. Convert PIL image to OpenCV context if necessary
        if isinstance(image, Image.Image):
            # Grayscale, palette, etc. images do not have the 3 channels that
            # COLOR_RGB2BGR expects, so bring every mode to RGB first.
            if image.mode != "RGB":
                image = image.convert("RGB")
            image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

        origin_height, origin_width = image.shape[:2]

        # 1. Pre-process
        start_pre = time.perf_counter()
        input_tensor = self._preprocess(image)
        end_pre = time.perf_counter()

        # 2. Inference
        start_run = time.perf_counter()
        outputs = self.ort_session_.run(input_tensor)
        end_run = time.perf_counter()

        # 3. Post-process
        start_post = time.perf_counter()
        detections = self._post_process(
            outputs, 
            origin_height, 
            origin_width, 
            confidence_threshold, 
            max_number_boxes
        )
        end_post = time.perf_counter()

        end_total = time.perf_counter()

        # perf_counter() is in seconds; report milliseconds
        timings = {
            "preprocess": (end_pre - start_pre) * 1000,
            "ort_run": (end_run - start_run) * 1000,
            "postprocess": (end_post - start_post) * 1000,
            "total": (end_total - start_total) * 1000
        }

        return detections, timings

    def save_detections(
        self, 
        image: np.ndarray, 
        detections: list[Detection], 
        save_image_path: str
    ) -> None:
        """
        Draw bounding boxes, masks and class labels on the original image and save it.

        Args:
            image (np.ndarray): Original image (BGR).
            detections (list[Detection]): List of Detection objects.
            save_image_path (str): Path to save the result.

        Raises:
            OSError: If cv2.imwrite reports that the image was not written.
        """
        result = image.copy()
        overlay = image.copy()

        # Generate a random color for each unique label (BGR); colors differ between calls
        label_colors = {
            label: (random.randint(0, 255),
                    random.randint(0, 255),
                    random.randint(0, 255))
            for label in {det.label for det in detections}
        }

        # Paint masks on the overlay
        for det in detections:
            if det.mask is not None:
                overlay[det.mask > 0] = label_colors[det.label]

        # Blend the overlay with the original image (written in place into result)
        alpha = 0.5
        cv2.addWeighted(overlay, alpha, result, 1 - alpha, 0, result)

        # Text settings are the same for every detection
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.5
        thickness = 1
        text_color = (255, 255, 255)

        # Draw boxes and labels on the result
        for det in detections:
            color = label_colors[det.label]

            # box is [x, y, w, h] float or int, convert to int for cv2
            x, y, w, h = (int(value) for value in det.unnormalized_box[:4])

            # Draw bounding box
            cv2.rectangle(result, (x, y), (x + w, y + h), color, 4)

            # Draw label text background just inside the top-left corner
            text = str(det.label)
            (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, thickness)
            text_x = x + 5
            text_y = y + 5 + text_height
            cv2.rectangle(result, (text_x, text_y - text_height - 5), (text_x + text_width, text_y + 5), color, -1)

            # Draw label text
            cv2.putText(result, text, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)

        # Save; imwrite signals failure through its return value
        if not cv2.imwrite(save_image_path, result):
            raise OSError(f"Failed to write image to '{save_image_path}'")

__init__(model_path, device='gpu')

Initialize the RF-DETR model.

Parameters:

Name Type Description Default
model_path str

Path to the ONNX model file.

required
device str

Device preference ("gpu" or "cpu").

'gpu'
Source code in python/modules/model.py
def __init__(self, model_path: str, device: str = "gpu"):
    """
    Create the inference session and cache preprocessing parameters.

    Args:
        model_path (str): Path to the ONNX model file.
        device (str): Device preference ("gpu" or "cpu").
    """
    session = OnnxRuntimeSession(model_path, device=device)
    self.ort_session_ = session

    # Height and width are the two trailing entries of the session's input shape.
    height, width = session.get_input_shape()[2:]
    self.input_height = height
    self.input_width = width

    # Normalization constants stored as float32 columns that broadcast over CHW images.
    per_channel = (3, 1, 1)
    self.means = np.array(self.MEANS, dtype=np.float32).reshape(per_channel)
    self.stds = np.array(self.STDS, dtype=np.float32).reshape(per_channel)

predict(image, confidence_threshold=DEFAULT_CONFIDENCE_THRESHOLD, max_number_boxes=DEFAULT_MAX_NUMBER_BOXES)

Predict bounding boxes and masks for a single image.

Parameters:

Name Type Description Default
image Union[ndarray, Image]

Input image (OpenCV format BGR or PIL Image).

required
confidence_threshold float

Confidence threshold for filtering boxes.

DEFAULT_CONFIDENCE_THRESHOLD
max_number_boxes int

Maximum number of boxes to return.

DEFAULT_MAX_NUMBER_BOXES

Returns:

Type Description
tuple[list[Detection], dict[str, float]]

A tuple of (detections, timings).

Source code in python/modules/model.py
def predict(
    self, 
    image: Union[np.ndarray, Image.Image], 
    confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD, 
    max_number_boxes: int = DEFAULT_MAX_NUMBER_BOXES
) -> tuple[list[Detection], dict[str, float]]:
    """
    Predict bounding boxes and masks for a single image.

    Args:
        image: Input image (OpenCV format BGR or PIL Image).
        confidence_threshold: Confidence threshold for filtering boxes.
        max_number_boxes: Maximum number of boxes to return.

    Returns:
        A tuple of (detections, timings). ``timings`` maps "preprocess",
        "ort_run", "postprocess" and "total" to durations in milliseconds.
    """
    start_total = time.perf_counter()

    # 0. Convert PIL image to OpenCV context if necessary
    if isinstance(image, Image.Image):
        # Grayscale, palette, etc. images do not have the 3 channels that
        # COLOR_RGB2BGR expects, so bring every mode to RGB first.
        if image.mode != "RGB":
            image = image.convert("RGB")
        image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

    origin_height, origin_width = image.shape[:2]

    # 1. Pre-process
    start_pre = time.perf_counter()
    input_tensor = self._preprocess(image)
    end_pre = time.perf_counter()

    # 2. Inference
    start_run = time.perf_counter()
    outputs = self.ort_session_.run(input_tensor)
    end_run = time.perf_counter()

    # 3. Post-process
    start_post = time.perf_counter()
    detections = self._post_process(
        outputs, 
        origin_height, 
        origin_width, 
        confidence_threshold, 
        max_number_boxes
    )
    end_post = time.perf_counter()

    end_total = time.perf_counter()

    # perf_counter() is in seconds; report milliseconds
    timings = {
        "preprocess": (end_pre - start_pre) * 1000,
        "ort_run": (end_run - start_run) * 1000,
        "postprocess": (end_post - start_post) * 1000,
        "total": (end_total - start_total) * 1000
    }

    return detections, timings

save_detections(image, detections, save_image_path)

Draw bounding boxes, masks and class labels on the original image and save it.

Parameters:

Name Type Description Default
image ndarray

Original image (BGR).

required
detections list[Detection]

List of Detection objects.

required
save_image_path str

Path to save the result.

required
Source code in python/modules/model.py
def save_detections(
    self, 
    image: np.ndarray, 
    detections: list[Detection], 
    save_image_path: str
) -> None:
    """
    Draw bounding boxes, masks and class labels on the original image and save it.

    Args:
        image (np.ndarray): Original image (BGR).
        detections (list[Detection]): List of Detection objects.
        save_image_path (str): Path to save the result.

    Raises:
        OSError: If cv2.imwrite reports that the image was not written.
    """
    result = image.copy()
    overlay = image.copy()

    # Generate a random color for each unique label (BGR); colors differ between calls
    label_colors = {
        label: (random.randint(0, 255),
                random.randint(0, 255),
                random.randint(0, 255))
        for label in {det.label for det in detections}
    }

    # Paint masks on the overlay
    for det in detections:
        if det.mask is not None:
            overlay[det.mask > 0] = label_colors[det.label]

    # Blend the overlay with the original image (written in place into result)
    alpha = 0.5
    cv2.addWeighted(overlay, alpha, result, 1 - alpha, 0, result)

    # Text settings are the same for every detection
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    thickness = 1
    text_color = (255, 255, 255)

    # Draw boxes and labels on the result
    for det in detections:
        color = label_colors[det.label]

        # box is [x, y, w, h] float or int, convert to int for cv2
        x, y, w, h = (int(value) for value in det.unnormalized_box[:4])

        # Draw bounding box
        cv2.rectangle(result, (x, y), (x + w, y + h), color, 4)

        # Draw label text background just inside the top-left corner
        text = str(det.label)
        (text_width, text_height), _ = cv2.getTextSize(text, font, font_scale, thickness)
        text_x = x + 5
        text_y = y + 5 + text_height
        cv2.rectangle(result, (text_x, text_y - text_height - 5), (text_x + text_width, text_y + 5), color, -1)

        # Draw label text
        cv2.putText(result, text, (text_x, text_y), font, font_scale, text_color, thickness, cv2.LINE_AA)

    # Save; imwrite signals failure through its return value
    if not cv2.imwrite(save_image_path, result):
        raise OSError(f"Failed to write image to '{save_image_path}'")

modules.onnx_runtime

Low-level ONNX Runtime session wrapper.

OnnxRuntimeSession

Wrapper class for ONNX Runtime session.

Source code in python/modules/onnx_runtime.py
class OnnxRuntimeSession:
    """Wrapper class for ONNX Runtime session.

    Selects execution providers for the requested device, records metadata of
    the model's first input, prints a summary of inputs/outputs, and casts
    inputs to the model's expected dtype when running inference.
    """

    def __init__(self, model_path: str, device: str = "gpu"):
        """
        Initialize the ONNX Runtime session with the best available provider for the chosen device.

        Args:
            model_path (str): Path to the ONNX model file.
            device (str): Device preference ("gpu" or "cpu").

        Raises:
            Exception: Any error raised while creating or warming up the
                session is printed and re-raised unchanged.
        """
        try:
            providers = self._get_best_providers(device)
            sess_options = ort.SessionOptions()
            # ORT_ENABLE_ALL allows the optimizer to convert FP16 nodes to FP32
            # when running on CPU (which does not natively support float16 tensors).
            sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
            self.session = ort.InferenceSession(
                model_path, sess_options=sess_options, providers=providers
            )
            # Only the first model input is used by this wrapper.
            self.input_info = self.session.get_inputs()[0]
            self.input_name = self.input_info.name
            self.input_shape = self.input_info.shape
            self.input_dtype = self._ort_type_to_numpy_dtype(self.input_info.type)

            print("Input metadata:")
            print(f"  - Name:  {self.input_name}")
            print(f"  - Shape: {self.input_shape}")
            print(f"  - Type:  {self.input_info.type} -> numpy: {self.input_dtype}")

            active_providers = self.session.get_providers()
            print(f"--- ONNX Runtime: Using {active_providers[0]} for inference ---")

            outputs = self.session.get_outputs()
            print(f"Output metadata ({len(outputs)} outputs):")
            for i, output in enumerate(outputs):
                print(f"  Output {i}:")
                print(f"    - Name:  {output.name}")
                print(f"    - Shape: {output.shape}")
                print(f"    - Type:  {output.type}")

            # Perform a warmup run to initialize CUDA/TensorRT
            if "TensorrtExecutionProvider" in active_providers or "CUDAExecutionProvider" in active_providers:
                print("--- ONNX Runtime: Warming up GPU... ---")
                # Dynamic dimensions are not integers, so np.zeros cannot take
                # the raw shape; use a concrete stand-in shape instead.
                dummy_input = np.zeros(
                    self._concrete_shape(self.input_shape), dtype=self.input_dtype
                )
                self.session.run(None, {self.input_name: dummy_input})
                print("--- ONNX Runtime: Warmup complete ---")
        except Exception as e:
            print(f"ERROR: Failed to load model '{model_path}' on {device}:")
            print(f"  {str(e)}")
            raise

    @staticmethod
    def _concrete_shape(shape) -> list[int]:
        """
        Replace every non-concrete dimension of *shape* with 1.

        Args:
            shape: Input shape whose dynamic entries may be strings or None.

        Returns:
            list[int]: Shape with positive integers kept and everything else set to 1.
        """
        return [dim if isinstance(dim, int) and dim > 0 else 1 for dim in shape]

    @staticmethod
    def _ort_type_to_numpy_dtype(ort_type: str) -> np.dtype:
        """
        Map an ONNX Runtime type string (e.g. 'tensor(float16)') to a numpy dtype.

        Args:
            ort_type (str): The type string returned by NodeArg.type.

        Returns:
            np.dtype: Corresponding numpy dtype (defaults to float32 for unknown types).
        """
        mapping = {
            "tensor(float16)": np.float16,
            "tensor(float)": np.float32,
            "tensor(double)": np.float64,
            "tensor(int8)": np.int8,
            "tensor(int16)": np.int16,
            "tensor(int32)": np.int32,
            "tensor(int64)": np.int64,
            "tensor(uint8)": np.uint8,
        }
        return np.dtype(mapping.get(ort_type, np.float32))

    def _get_best_providers(self, device: str = "gpu") -> list[str]:
        """
        Determine the best available execution providers based on device preference.

        Args:
            device (str): "gpu" (TensorRT > CUDA > CPU) or "cpu" (CPU only).

        Returns:
            list[str]: Provider names in priority order; always ends with
            "CPUExecutionProvider".
        """
        providers = []

        if device.lower() == "gpu":
            available = ort.get_available_providers()
            # Order matters: earlier providers take priority.
            providers.extend(
                name
                for name in ("TensorrtExecutionProvider", "CUDAExecutionProvider")
                if name in available
            )

        # CPU is always appended as the last fallback.
        providers.append("CPUExecutionProvider")
        return providers

    def run(self, input_data: np.ndarray) -> list[np.ndarray]:
        """Run inference with the provided input data, casting to the model's expected dtype."""
        if input_data.dtype != self.input_dtype:
            input_data = input_data.astype(self.input_dtype)
        # None requests every model output.
        return self.session.run(None, {self.input_name: input_data})

    def get_input_shape(self) -> list[int]:
        """Get the expected input shape of the model."""
        return self.input_shape

    def get_input_name(self) -> str:
        """Get the name of the input tensor."""
        return self.input_name

    def get_inputs(self) -> list:
        """Get all input information."""
        return self.session.get_inputs()

__init__(model_path, device='gpu')

Initialize the ONNX Runtime session with the best available provider for the chosen device.

Parameters:

Name Type Description Default
model_path str

Path to the ONNX model file.

required
device str

Device preference ("gpu" or "cpu").

'gpu'
Source code in python/modules/onnx_runtime.py
def __init__(self, model_path: str, device: str = "gpu"):
    """
    Initialize the ONNX Runtime session with the best available provider for the chosen device.

    Args:
        model_path (str): Path to the ONNX model file.
        device (str): Device preference ("gpu" or "cpu").

    Raises:
        Exception: Any error raised while creating or warming up the
            session is printed and re-raised unchanged.
    """
    try:
        providers = self._get_best_providers(device)
        sess_options = ort.SessionOptions()
        # ORT_ENABLE_ALL allows the optimizer to convert FP16 nodes to FP32
        # when running on CPU (which does not natively support float16 tensors).
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        self.session = ort.InferenceSession(
            model_path, sess_options=sess_options, providers=providers
        )
        # Only the first model input is used by this wrapper.
        self.input_info = self.session.get_inputs()[0]
        self.input_name = self.input_info.name
        self.input_shape = self.input_info.shape
        self.input_dtype = self._ort_type_to_numpy_dtype(self.input_info.type)

        print("Input metadata:")
        print(f"  - Name:  {self.input_name}")
        print(f"  - Shape: {self.input_shape}")
        print(f"  - Type:  {self.input_info.type} -> numpy: {self.input_dtype}")

        active_providers = self.session.get_providers()
        print(f"--- ONNX Runtime: Using {active_providers[0]} for inference ---")

        outputs = self.session.get_outputs()
        print(f"Output metadata ({len(outputs)} outputs):")
        for i, output in enumerate(outputs):
            print(f"  Output {i}:")
            print(f"    - Name:  {output.name}")
            print(f"    - Shape: {output.shape}")
            print(f"    - Type:  {output.type}")

        # Perform a warmup run to initialize CUDA/TensorRT
        if "TensorrtExecutionProvider" in active_providers or "CUDAExecutionProvider" in active_providers:
            print("--- ONNX Runtime: Warming up GPU... ---")
            # Dynamic dimensions are not integers, so np.zeros cannot take the
            # raw shape; substitute 1 for every non-concrete dimension.
            warmup_shape = [
                dim if isinstance(dim, int) and dim > 0 else 1
                for dim in self.input_shape
            ]
            dummy_input = np.zeros(warmup_shape, dtype=self.input_dtype)
            self.session.run(None, {self.input_name: dummy_input})
            print("--- ONNX Runtime: Warmup complete ---")
    except Exception as e:
        print(f"ERROR: Failed to load model '{model_path}' on {device}:")
        print(f"  {str(e)}")
        raise

run(input_data)

Run inference with the provided input data, casting to the model's expected dtype.

Source code in python/modules/onnx_runtime.py
def run(self, input_data: np.ndarray) -> list[np.ndarray]:
    """Execute the session on *input_data* and return all model outputs.

    The array is converted to the model's expected dtype first when its own
    dtype differs.
    """
    expected_dtype = self.input_dtype
    if input_data.dtype == expected_dtype:
        tensor = input_data
    else:
        tensor = input_data.astype(expected_dtype)
    feeds = {self.input_name: tensor}
    # None requests every model output.
    return self.session.run(None, feeds)

get_input_shape()

Get the expected input shape of the model.

Source code in python/modules/onnx_runtime.py
def get_input_shape(self) -> list[int]:
    """Return the input shape stored on this session wrapper."""
    shape = self.input_shape
    return shape

get_input_name()

Get the name of the input tensor.

Source code in python/modules/onnx_runtime.py
def get_input_name(self) -> str:
    """Return the input tensor name stored on this session wrapper."""
    name = self.input_name
    return name

get_inputs()

Get all input information.

Source code in python/modules/onnx_runtime.py
def get_inputs(self) -> list:
    """Return whatever the underlying session reports as its inputs."""
    session = self.session
    return session.get_inputs()

modules.utils

Utility functions for image loading and coordinate conversions.

utils

open_image(path)

Open an image from a local path or a URL.

Source code in python/modules/utils.py
def open_image(path: str, timeout: float = 10.0) -> Image.Image:
    """Open an image from a local path or a URL.

    Args:
        path (str): Local file path, or a URL starting with 'http://' or 'https://'.
        timeout (float): Seconds to wait for the HTTP request before giving up
            (only used for URLs).

    Returns:
        Image.Image: The opened PIL image.

    Raises:
        FileNotFoundError: If a local path does not exist.
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    # Check if the path is a URL (starts with 'http://' or 'https://')
    if path.startswith(("http://", "https://")):
        # Without a timeout requests can wait forever on an unresponsive server.
        response = requests.get(path, timeout=timeout)
        # Fail on 4xx/5xx instead of handing an error page to the image decoder.
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))

    # If it's a local file path, open the image directly
    if not os.path.exists(path):
        raise FileNotFoundError(f"The file {path} does not exist.")
    return Image.open(path)

sigmoid(x)

Compute the sigmoid function.

Source code in python/modules/utils.py
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Compute the sigmoid function element-wise in a numerically stable way.

    Args:
        x (np.ndarray): Input values (logits).

    Returns:
        np.ndarray: 1 / (1 + exp(-x)) for every element, in the input's
        floating dtype.
    """
    x = np.asarray(x)
    # exp is only evaluated on non-positive arguments, so it cannot overflow
    # (the naive form overflows for strongly negative x, notably in float16).
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)), which is equal.
    return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

box_cxcywh_to_xywh(x)

Convert boxes from center x, y, width, height (cxcywh) to x_left, y_top, width, height (xywh).

Source code in python/modules/utils.py
def box_cxcywh_to_xywh(x: np.ndarray) -> np.ndarray:
    """Convert boxes from center x, y, width, height (cxcywh) to x_left, y_top, width, height (xywh)."""
    cx, cy, w, h = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
    x_left = cx - w / 2
    y_top = cy - h / 2
    return np.stack([x_left, y_top, w, h], axis=-1)

box_cxcywh_to_xyxyn(x)

Convert boxes from center x, y, width, height (cxcywh) to min/max format (xyxyn).

Source code in python/modules/utils.py
def box_cxcywh_to_xyxyn(x: np.ndarray) -> np.ndarray:
    """Convert boxes from center x, y, width, height (cxcywh) to min/max format (xyxyn)."""
    cx, cy, w, h = x[..., 0], x[..., 1], x[..., 2], x[..., 3]
    xmin = cx - w / 2
    ymin = cy - h / 2
    xmax = cx + w / 2
    ymax = cy + h / 2
    return np.stack([xmin, ymin, xmax, ymax], axis=-1)

Data Structures

Detection

A dataclass representing a single detection result:

Field Type Description
score float Confidence score in [0, 1]
label int Predicted class index
normalized_box np.ndarray Bounding box [x, y, w, h] normalized to [0, 1]
unnormalized_box np.ndarray Bounding box [x, y, w, h] in pixels
mask np.ndarray | None Binary segmentation mask (H, W) as uint8 with values 0 or 255, resized to the original image size, or None for detection-only models

Timings Dictionary

predict() returns a (detections, timings) tuple; timings is a dict[str, float] with durations in milliseconds:

Key Description
"preprocess" Image resize + normalize + tensor creation
"ort_run" ONNX Runtime session execution
"postprocess" Score filtering, box decoding, mask resize
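"total" End-to-end wall time of predict(), including the PIL-to-BGR conversion when a PIL image is passed (image loading and saving are not timed)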

Constants

Constant Value Description
DEFAULT_CONFIDENCE_THRESHOLD 0.5 Default score cutoff
DEFAULT_MAX_NUMBER_BOXES 300 Max detections returned