Compare commits

...

2 Commits

Author SHA1 Message Date
2434635399 test bbox 2025-01-23 02:15:48 +02:00
495c75a86d test bbox 2025-01-23 02:14:37 +02:00
5 changed files with 590 additions and 15 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
/dataset/cats
/dataset/cats.zip
/output

View File

@ -58,19 +58,9 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating new Ultralytics Settings v0.0.6 file \n",
"View Ultralytics Settings with 'yolo settings' or at 'C:\\Users\\danie\\AppData\\Roaming\\Ultralytics\\settings.json'\n",
"Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.\n"
]
}
],
"outputs": [],
"source": [
"import os\n",
"import cv2\n",
@ -85,19 +75,601 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"YOLO\n"
"Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 6.25M/6.25M [00:00<00:00, 10.3MB/s]\n"
]
}
],
"source": [
"print(\"YOLO\")"
"# Fall back to the CPU when no CUDA device is available so the notebook still runs on GPU-less machines\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"model = YOLO(\"yolov8n.pt\").to(device)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cuda:0\n"
]
}
],
"source": [
"print(model.device)"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [],
"source": [
"# Image.open is lazy and keeps the file handle open; read the pixels inside a context manager\n",
"# and normalise to 3-channel RGB so grayscale/RGBA/palette files give the layout later cells expect\n",
"with Image.open(\"dataset/1818949000-IMG-20240118-WA0001.jpg\") as image_file:\n",
"    im = image_file.convert(\"RGB\")"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [],
"source": [
"# Convert the PIL image to a tensor. Presumably this is torchvision's ToTensor, which is documented\n",
"# to return a float C x H x W tensor scaled to [0, 1] (TODO confirm against the import cell)\n",
"im = transforms.ToTensor()(im)"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"# Add the batch dimension exactly once. The previous reshape(-1, s0, s1, s2) was not idempotent:\n",
"# re-running it on an already-batched (1, C, H, W) tensor silently permuted it into (W, 1, C, H)\n",
"if im.ndim == 3:\n",
"    im = im[None]"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1599, 899, 3)"
]
},
"execution_count": 161,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"im.shape"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
"def resize_and_pad_image(im, stride=32, pad_value=114 / 255):\n",
"    \"\"\"Pad an image tensor so its height and width are multiples of ``stride``.\n",
"\n",
"    The earlier version stretched the image with ``Resize((new_h, new_w))``, which does\n",
"    not preserve the aspect ratio, and round-tripped through PIL, which quantises values\n",
"    to 8 bits, forces the data onto the CPU and only works for a batch of one. Padding\n",
"    leaves every original pixel untouched and yields the same output shape.\n",
"\n",
"    Args:\n",
"        im: Image tensor whose last two dimensions are (H, W), e.g. (1, C, H, W) or (C, H, W).\n",
"        stride: Positive integer the output height and width must be divisible by.\n",
"        pad_value: Fill value for the added border. The default is a mid-gray for images\n",
"            scaled to [0, 1]; pass a different value for other ranges.\n",
"\n",
"    Returns:\n",
"        A tensor with the same leading dimensions and spatial size\n",
"        (ceil(H / stride) * stride, ceil(W / stride) * stride). The input itself is\n",
"        returned when it is already aligned.\n",
"\n",
"    Raises:\n",
"        ValueError: If ``stride`` is not positive.\n",
"    \"\"\"\n",
"    import torch.nn.functional as F  # local import keeps this cell self-contained\n",
"\n",
"    if stride <= 0:\n",
"        raise ValueError(f\"stride must be a positive integer, got {stride!r}\")\n",
"\n",
"    h, w = im.shape[-2:]\n",
"\n",
"    # Number of rows/columns missing to reach the next multiple of stride (0 when aligned)\n",
"    pad_h = -h % stride\n",
"    pad_w = -w % stride\n",
"    if pad_h == 0 and pad_w == 0:\n",
"        return im\n",
"\n",
"    # F.pad orders the amounts as (left, right, top, bottom) for the last two dimensions.\n",
"    # Padding only on the right/bottom keeps pixel coordinates of the original content unchanged.\n",
"    return F.pad(im, (0, pad_w, 0, pad_h), mode=\"constant\", value=pad_value)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"im = resize_and_pad_image(im)"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
"# Convert to a NumPy array (assumes `im` is still a CPU torch tensor at this point - TODO confirm)\n",
"im = im.numpy()"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1599, 899, 3)"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"im.shape"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
"# Move axis 0 to the end; the recorded shape afterwards is (1599, 899, 3), i.e. channels-last.\n",
"# NOTE(review): not idempotent - re-running this cell rotates the axes again.\n",
"im = im.transpose(1,2,0)"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"YOLO(\n",
" (model): DetectionModel(\n",
" (model): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0): Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (3): Conv(\n",
" (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (4): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0-1): 2 x Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (5): Conv(\n",
" (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (6): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(128, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0-1): 2 x Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (7): Conv(\n",
" (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (8): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(384, 256, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0): Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (9): SPPF(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): MaxPool2d(kernel_size=5, stride=1, padding=2, dilation=1, ceil_mode=False)\n",
" )\n",
" (10): Upsample(scale_factor=2.0, mode='nearest')\n",
" (11): Concat()\n",
" (12): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(384, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0): Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (13): Upsample(scale_factor=2.0, mode='nearest')\n",
" (14): Concat()\n",
" (15): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(96, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0): Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (16): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (17): Concat()\n",
" (18): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(192, 128, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0): Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (19): Conv(\n",
" (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (20): Concat()\n",
" (21): C2f(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(384, 256, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(384, 256, kernel_size=(1, 1), stride=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (m): ModuleList(\n",
" (0): Bottleneck(\n",
" (cv1): Conv(\n",
" (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (cv2): Conv(\n",
" (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (22): Detect(\n",
" (cv2): ModuleList(\n",
" (0): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" (1): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" (2): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(256, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" )\n",
" (cv3): ModuleList(\n",
" (0): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(64, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" (1): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(128, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" (2): Sequential(\n",
" (0): Conv(\n",
" (conv): Conv2d(256, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (1): Conv(\n",
" (conv): Conv2d(80, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
" (act): SiLU(inplace=True)\n",
" )\n",
" (2): Conv2d(80, 80, kernel_size=(1, 1), stride=(1, 1))\n",
" )\n",
" )\n",
" (dfl): DFL(\n",
" (conv): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1), bias=False)\n",
" )\n",
" )\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 181,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Trailing semicolon suppresses the several-hundred-line module repr that eval() returns\n",
"model.eval();"
]
},
{
"cell_type": "code",
"execution_count": 234,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"0: 640x384 (no detections), 23.0ms\n",
"Speed: 23.6ms preprocess, 23.0ms inference, 3.0ms postprocess per image at shape (1, 3, 640, 384)\n"
]
}
],
"source": [
"# Ultralytics treats a NumPy source as an HWC, BGR, uint8 image in [0, 255]. `im` here is\n",
"# presumably the HWC RGB float array in [0, 1] produced by the ToTensor -> numpy -> transpose\n",
"# cells (TODO confirm), which is most likely why the raw call reported \"(no detections)\"\n",
"# while predicting from the file path finds the cat. Convert before running inference.\n",
"if (\n",
"    isinstance(im, np.ndarray)\n",
"    and im.ndim == 3\n",
"    and im.shape[-1] == 3\n",
"    and np.issubdtype(im.dtype, np.floating)\n",
"):\n",
"    # RGB -> BGR, rescale to 0-255 and make the flipped view contiguous\n",
"    model_input = np.ascontiguousarray((im[..., ::-1] * 255.0).round().clip(0, 255).astype(np.uint8))\n",
"else:\n",
"    model_input = im\n",
"\n",
"with torch.no_grad():\n",
"    pred = model(model_input)"
]
},
{
"cell_type": "code",
"execution_count": 241,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"image 1/1 e:\\Facultate\\Master\\Anul 1\\CV\\Project\\dataset\\1818949000-IMG-20240118-WA0001.jpg: 640x384 1 cat, 1 chair, 29.5ms\n",
"Speed: 5.0ms preprocess, 29.5ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 384)\n"
]
}
],
"source": [
"# Run inference straight from the file path so the library does its own loading and preprocessing;\n",
"# the recorded output for this call lists \"1 cat, 1 chair\". save=False presumably skips writing\n",
"# annotated images to disk (TODO confirm against the Ultralytics predict arguments).\n",
"results = model.predict(source=\"dataset/1818949000-IMG-20240118-WA0001.jpg\", save=False)"
]
},
{
"cell_type": "code",
"execution_count": 242,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Class: cat, Confidence: 0.79, Box: 1.4070484638214111, 389.7481994628906, 766.0083618164062, 1191.5501708984375\n",
"Class: chair, Confidence: 0.27, Box: 0.0, 255.45545959472656, 884.272216796875, 1599.0\n"
]
}
],
"source": [
"# Report every detection: class label, confidence score and corner coordinates\n",
"for result in results:\n",
"    boxes = result.boxes  # detections found in this image\n",
"\n",
"    for box in boxes:\n",
"        # Look up the readable label for the predicted class index\n",
"        class_id = box.cls[0]\n",
"        class_name = model.names[int(class_id)]\n",
"\n",
"        # Detection score\n",
"        confidence = box.conf[0]\n",
"\n",
"        # Corners in xyxy order: [x_min, y_min, x_max, y_max]\n",
"        x_min, y_min, x_max, y_max = box.xyxy[0]\n",
"\n",
"        print(f\"Class: {class_name}, Confidence: {confidence:.2f}, Box: {x_min}, {y_min}, {x_max}, {y_max}\")"
]
},
{
"cell_type": "code",
"execution_count": 251,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 251,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"image_path = \"dataset/1818949000-IMG-20240118-WA0001.jpg\"\n",
"output_path = \"output/output.jpg\"\n",
"min_confidence = 0.7  # only draw detections scoring above this threshold\n",
"\n",
"image = cv2.imread(image_path)\n",
"# cv2.imread returns None instead of raising when the file is missing or unreadable\n",
"if image is None:\n",
"    raise FileNotFoundError(f\"Could not read image: {image_path}\")\n",
"\n",
"for result in results:\n",
"    boxes = result.boxes\n",
"\n",
"    for box in boxes:\n",
"        confidence = float(box.conf[0])\n",
"        if confidence <= min_confidence:\n",
"            continue\n",
"\n",
"        x_min, y_min, x_max, y_max = map(int, box.xyxy[0])\n",
"        class_id = int(box.cls[0])\n",
"        class_name = model.names[class_id]\n",
"\n",
"        # Format the label with class name and confidence\n",
"        label = f\"{class_name} {confidence:.2f}\"\n",
"\n",
"        # Draw the bounding box\n",
"        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)\n",
"\n",
"        # Keep the label inside the frame when the box touches the top edge\n",
"        label_y = max(y_min - 10, 15)\n",
"        cv2.putText(image, label, (x_min, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)\n",
"\n",
"# cv2.imwrite just returns False when the target directory is missing, so create it first\n",
"os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
"\n",
"# Save the annotated image; the cell displays True on success\n",
"cv2.imwrite(output_path, image)"
]
},
{

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

0
dataset_download.py Normal file
View File

BIN
yolov8n.pt Normal file

Binary file not shown.