predictions [batch_size, 2] target_sizes = torch.Tensor([image.size[::-1]]) # Convert outputs (bounding boxes and class logits) to COCO API results = processor.post_process(outputs=outputs, target_sizes=target_sizes) i = 0 # Retrieve predictions for the first image for the corresponding text queries text = texts[i] boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] score_threshold = 0.1 for box, score, label in zip(boxes, scores, labels): box = [round(i, 2) for i in box.tolist()] if score >= score_threshold: print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") 47