In [7]:
%pip install ultralytics 

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import cv2
import numpy as np
from ultralytics import YOLO
from torchvision import models, transforms
from sklearn.cluster import KMeans
import torch
from matplotlib import pyplot as plt
from PIL import Image

In [4]:
model = YOLO("yolov8n.pt").to('cuda')

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 10.3MB/s]


In [11]:
print(model.device)

cuda:0


In [175]:
im = Image.open("dataset/1818949000-IMG-20240118-WA0001.jpg")

In [176]:
im = transforms.ToTensor()(im)
# im = im.numpy()

In [130]:
im = im.reshape(-1, im.shape[0], im.shape[1], im.shape[2])

# apply resize to image 3, 640, 640


In [161]:
im.shape

(1599, 899, 3)

In [116]:
def resize_and_pad_image(im, stride=32):
    # Get original dimensions
    _, _, h, w = im.shape

    # Calculate the nearest divisible dimensions
    new_h = int(np.ceil(h / stride) * stride)
    new_w = int(np.ceil(w / stride) * stride)

    # Resize the image while maintaining aspect ratio
    resize_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((new_h, new_w)),  # Resize to divisible dimensions
        transforms.ToTensor()
    ])

    # Apply transform to the tensor
    im_resized = resize_transform(im.squeeze(0))  # Remove batch dimension for processing

    # Add batch dimension back
    im_resized = im_resized.unsqueeze(0)
    return im_resized

In [117]:
im = resize_and_pad_image(im)

In [177]:
im = im.numpy()

In [180]:
im.shape

(1599, 899, 3)

In [179]:
im = im.transpose(1,2,0)

In [181]:
model.eval()

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1))
          (act): SiLU(inplace=True)
        )
        (m): ModuleList(
          (0): Bottleneck(
            (cv1): Conv(
              (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              (act): SiLU(inplace=True)
            )
            (cv2): Conv(
              (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
              (act): SiLU(inplace=True)
   

In [234]:
with torch.no_grad():
    pred = model(im)


0: 640x384 (no detections), 23.0ms
Speed: 23.6ms preprocess, 23.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 384)


In [241]:
results = model.predict(source="dataset/1818949000-IMG-20240118-WA0001.jpg", save=False)


image 1/1 e:\Facultate\Master\Anul 1\CV\Project\dataset\1818949000-IMG-20240118-WA0001.jpg: 640x384 1 cat, 1 chair, 29.5ms
Speed: 5.0ms preprocess, 29.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)


In [242]:
for result in results:
    boxes = result.boxes  # Bounding box information

    for box in boxes:
        # Box coordinates
        x_min, y_min, x_max, y_max = box.xyxy[0]  # Format: [x_min, y_min, x_max, y_max]

        # Confidence score
        confidence = box.conf[0]

        # Class ID or name
        class_id = box.cls[0]
        class_name = model.names[int(class_id)]  # Convert class ID to class name

        print(f"Class: {class_name}, Confidence: {confidence:.2f}, Box: {x_min}, {y_min}, {x_max}, {y_max}")

Class: cat, Confidence: 0.79, Box: 1.4070484638214111, 389.7481994628906, 766.0083618164062, 1191.5501708984375
Class: chair, Confidence: 0.27, Box: 0.0, 255.45545959472656, 884.272216796875, 1599.0


In [251]:
image = cv2.imread("dataset/1818949000-IMG-20240118-WA0001.jpg")
for result in results:
    boxes = result.boxes

    for box in boxes:
        x_min, y_min, x_max, y_max = map(int, box.xyxy[0])
        class_id = int(box.cls[0])
        class_name = model.names[class_id]
        confidence = box.conf[0]

        if confidence > 0.7:

            # Format the label with class name and confidence
            label = f"{class_name} {confidence:.2f}"

            # Draw the bounding box
            cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
            cv2.putText(image, label, (x_min, y_min - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

# Save or display the image
cv2.imwrite("output/output.jpg", image)

True