Simple object detection

2023-10-01 19:56:40 -05:00 · 2023-10-01 19:56:40 -05:00 · c56d7c86fc
parent 3a2ed7d4eb
commit c56d7c86fc
12 changed files with 2507 additions and 221 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 .env
 config/
+using_yolov8.ipynb
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
+3.10.5
--- a/environment.yml
+++ b/environment.yml
--- a/main.py
+++ b/main.py
@ -1,209 +0,0 @@
-import datetime
-import face_recognition
-import cv2
-import numpy as np
-from dotenv import load_dotenv
-import os
-import json
-import pathlib
-import requests
-import time
-
-
-load_dotenv()
-URL = os.getenv("URL")
-RUN_SCALE = os.getenv("RUN_SCALE")
-VIEW_SCALE = os.getenv("VIEW_SCALE")
-DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
-# RUN_SCALE = 0.25
-# VIEW_SCALE = 0.75
-DISPLAY = False
-RUN_BY_COMPOSE = os.getenv("RUN_BY_COMPOSE")
-NTFY_URL = os.getenv("NTFY_URL")
-
-
-def find_face_from_name(name):
-    for face in config["faces"]:
-        if config["faces"][face]["name"] == name:
-            return face
-    return None
-
-
-def write_config():
-    with open(config_path, "w") as config_file:
-        json.dump(config, config_file, indent=4)
-
-
-print("Hello, world!")
-
-# Initialize some variables
-face_locations = []
-face_encodings = []
-face_names = []
-known_face_encodings = []
-known_face_names = []
-process_this_frame = True
-
-# Load the config file, if it does not exist or is blank, create it
-config = {
-    # If RUN_BY_COMPOSE is true, set url to rtsp://wyze-bridge:8554/wyze_cam_name, otherwise set it to "rtsp://localhost:8554/wyze_cam_name"
-    "URL": "rtsp://localhost:8554/wyze_cam_name"
-    if not RUN_BY_COMPOSE
-    else "rtsp://bridge:8554/wyze_cam_name",
-    "run_scale": "0.25",
-    "view_scale": "0.75",
-    "faces": {
-        "example1": {"image": "config/example1.jpg", "last_seen": ""},
-        "example2": {"image": "config/example2.jpg", "last_seen": ""},
-    },
-    "ntfy_url": "https://ntfy.sh/example",
-    "display": True,
-}
-config_path = pathlib.Path("config/config.json")
-if config_path.exists():
-    with open(config_path, "r") as config_file:
-        config = json.load(config_file)
-else:
-    with open(config_path, "w") as config_file:
-        json.dump(config, config_file, indent=4)
-    print("Config file created, please edit it and restart the program")
-    print("For relative paths, use the format config/example.jpg")
-    exit()
-
-
-if URL:
-    config["URL"] = URL
-else:
-    URL = config["URL"]
-if RUN_SCALE:
-    config["RUN_SCALE"] = RUN_SCALE
-else:
-    RUN_SCALE = float(config["RUN_SCALE"])
-if VIEW_SCALE:
-    config["VIEW_SCALE"] = VIEW_SCALE
-else:
-    VIEW_SCALE = float(config["VIEW_SCALE"])
-if DISPLAY:
-    config["DISPLAY"] = DISPLAY
-else:
-    DISPLAY = config["display"]
-if NTFY_URL:
-    config["ntfy_url"] = NTFY_URL
-else:
-    NTFY_URL = config["ntfy_url"]
-print(f"Current config: {config}")
-
-for face in config["faces"]:
-    # Load a sample picture and learn how to recognize it.
-    image = face_recognition.load_image_file(config["faces"][face]["image"])
-    face_encoding = face_recognition.face_encodings(image)[0]
-    known_face_encodings.append(face_encoding)
-    # Append the key to the list of known face names
-    known_face_names.append(face)
-
-video_capture = cv2.VideoCapture(URL)
-# Eliminate lag by setting the buffer size to 1
-# This makes it so that the video capture will only grab the most recent frame
-# However, this means that the video may be choppy
-video_capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
-
-# Print the resolution of the video
-print(
-    f"Video resolution: {video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)}x{video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)}"
-)
-
-print("Beginning video capture...")
-while True:
-    # Grab a single frame of video
-    ret, frame = video_capture.read()
-    # Only process every other frame of video to save time
-    # Resize frame of video to a smaller size for faster face recognition processing
-    run_frame = cv2.resize(frame, (0, 0), fx=RUN_SCALE, fy=RUN_SCALE)
-    view_frame = cv2.resize(frame, (0, 0), fx=VIEW_SCALE, fy=VIEW_SCALE)
-    # Convert the image from BGR color (which OpenCV uses) to RGB color (which face_recognition uses)
-    rgb_run_frame = run_frame[:, :, ::-1]
-    # Find all the faces and face encodings in the current frame of video
-    # model cnn is gpu accelerated, but hog is cpu only
-    face_locations = face_recognition.face_locations(
-        rgb_run_frame, model="hog"
-    )  # This crashes the program without output on my laptop when it's running without Docker compose
-    face_encodings = face_recognition.face_encodings(rgb_run_frame, face_locations)
-    face_names = []
-    for face_encoding in face_encodings:
-        # See if the face is a match for the known face(s)
-        matches = face_recognition.compare_faces(known_face_encodings, face_encoding)
-        name = "Unknown"
-        # Or instead, use the known face with the smallest distance to the new face
-        face_distances = face_recognition.face_distance(
-            known_face_encodings, face_encoding
-        )
-        best_match_index = np.argmin(face_distances)
-        if matches[best_match_index]:
-            name = known_face_names[best_match_index]
-            last_seen = config["faces"][name]["last_seen"]
-            # If it's never been seen, set the last seen time to x+5 seconds ago so it will be seen
-            # Kind of a hacky way to do it, but it works... hopefully
-            if last_seen == "":
-                print(f"{name} has been seen for the first time")
-                config["faces"][name]["last_seen"] = (
-                    datetime.datetime.now() - datetime.timedelta(seconds=15)
-                ).strftime(DATETIME_FORMAT)
-                write_config()
-            # Check if the face has been seen in the last 5 seconds
-            if datetime.datetime.now() - datetime.datetime.strptime(
-                last_seen, DATETIME_FORMAT
-            ) > datetime.timedelta(seconds=10):
-                print(f"{name} has been seen")
-                # Send a notification
-                print(f"Sending notification to{NTFY_URL}")
-                requests.post(
-                    NTFY_URL,
-                    data=f'"{name}" has been seen',
-                    headers={
-                        "Title": "Face Detected",
-                        "Priority": "default",
-                        "Tags": "neutral_face",
-                    },
-                )
-            # Update the last seen time
-            config["faces"][name]["last_seen"] = datetime.datetime.now().strftime(
-                DATETIME_FORMAT
-            )
-            # print("Writing config...")
-            write_config()
-        face_names.append(name)
-    # Display the results
-    # Iterate over each face found in the frame to draw a box around it
-    # Zip is used to iterate over two lists at the same time
-    for (top, right, bottom, left), name in zip(face_locations, face_names):
-        # print(f"Face found at {top}, {right}, {bottom}, {left} with name {name}")
-        # Scale back up face locations since the frame we detected in was scaled to 1/4 size
-        top = int(top * (VIEW_SCALE / RUN_SCALE))
-        right = int(right * (VIEW_SCALE / RUN_SCALE))
-        bottom = int(bottom * (VIEW_SCALE / RUN_SCALE))
-        left = int(left * (VIEW_SCALE / RUN_SCALE))
-
-        # Draw a box around the face
-        cv2.rectangle(view_frame, (left, top), (right, bottom), (0, 0, 255), 2)
-
-        # Draw a label with a name below the face
-        cv2.rectangle(
-            view_frame, (left, bottom - 35), (right, bottom), (0, 0, 255), cv2.FILLED
-        )
-        font = cv2.FONT_HERSHEY_DUPLEX
-        cv2.putText(
-            view_frame, name, (left + 6, bottom - 6), font, 1.0, (255, 255, 255), 1
-        )
-
-    # Display the resulting image if DISPLAY is set to true
-    if config["display"]:
-        cv2.imshow("Scaled View", view_frame)
-
-    # Hit 'q' on the keyboard to quit!
-    if cv2.waitKey(1) & 0xFF == ord("q"):
-        break
-
-# Release handle to the webcam
-print("Releasing video capture")
-video_capture.release()
-cv2.destroyAllWindows()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,58 @@
+[tool.poetry]
+name = "detect-it"
+version = "0.1.0"
+description = "Detect all the things"
+authors = ["slashtechno <77907286+slashtechno@users.noreply.github.com>"]
+license = "MIT"
+readme = "README.md"
+packages = [{include = "detect_it"}]
+
+[tool.poetry.dependencies]
+python = "^3.10"
+python-dotenv = "^1.0.0"
+httpx = "^0.25.0"
+opencv-python = "^4.8.1.78"
+ultralytics = "^8.0.190"
+hjson = "^3.1.0"
+numpy = "^1.23.2"
+torch = [
+  { version = "^2.0.0+cu118", source = "torch_cu118", markers = "extra=='cuda'" },
+  { version = "^2.0.0+cpu", source = "torch_cpu", markers = "extra!='cuda'" },
+]
+torchaudio = [
+  { version = "^2.0.0+cu118", source = "torch_cu118", markers = "extra=='cuda'" },
+  { version = "^2.0.0+cpu", source = "torch_cpu", markers = "extra!='cuda'" },
+]
+torchvision = [
+  { version = "^0.15+cu118", source = "torch_cu118", markers = "extra=='cuda'" },
+  { version = "^0.15+cpu", source = "torch_cpu", markers = "extra!='cuda'" },
+]
+
+
+[tool.poetry.group.dev.dependencies]
+black = "^23.9.1"
+ruff = "^0.0.291"
+ipykernel = "^6.25.2"
+
+
+[[tool.poetry.source]]
+name = "torch_cpu"
+url = "https://download.pytorch.org/whl/cpu"
+priority = "supplemental"
+
+[[tool.poetry.source]]
+name = "torch_cu118"
+url = "https://download.pytorch.org/whl/cu118"
+priority = "supplemental"
+
+[tool.poetry.extras]
+cuda = []
+
+[[tool.poetry.source]]
+name = "PyPI"
+priority = "primary"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
--- a/requirements.txt
+++ b/requirements.txt
@ -1,11 +0,0 @@
-# certifi @ file:///croot/certifi_1665076670883/work/certifi
-click==8.1.3
-dlib==19.24.0
-face-recognition==1.3.0
-face-recognition-models==0.3.0
-numpy==1.23.5
-opencv-python==4.6.0.66
-Pillow==9.3.0
-python-dotenv==0.21.0
-urllib3==1.26.13
-requests==2.31.0
--- a/src/init.py
+++ b/src/init.py
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,129 @@
+# import face_recognition
+import cv2
+import numpy as np
+import dotenv
+from pathlib import Path
+import os
+import time
+# import hjson as json
+import torch
+from ultralytics import YOLO
+
+import argparse
+
+from .utils import notify, config_utils
+
+DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
+args = None
+
+def _parse_capture_source(value):
+    """Convert a ``--capture-device`` value into something OpenCV can open.
+
+    Anything ``int()`` accepts becomes an integer device index; any other
+    value (for example a stream URL) is returned unchanged so that it can be
+    handed to ``cv2.VideoCapture`` as a string.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError):
+        return value
+
+
+def main():
+    """Parse options, then run YOLOv8 detection on a video feed until 'q' is hit.
+
+    Options come from the command line, falling back to the ``RUN_SCALE``,
+    ``CAPTURE_DEVICE`` and ``NTFY_URL`` environment variables (optionally
+    loaded from a ``.env`` file) and finally to built-in defaults. The parsed
+    namespace is stored in the module-level ``args``.
+
+    Each frame is downscaled by ``--run-scale``, passed to the model, and the
+    annotated result is shown in a window named "View". The loop ends when
+    'q' is pressed or when a frame can no longer be read.
+    """
+    global args
+
+    if Path(".env").is_file():
+        dotenv.load_dotenv()
+        print("Loaded .env file")
+    else:
+        print("No .env file found")
+
+    argparser = argparse.ArgumentParser(
+        prog="Detect It",
+        description="Detect it all!",
+        epilog=":)",
+    )
+
+    # For every option: use the environment variable when it is set and not
+    # blank, otherwise fall back to the hard-coded default. argparse runs
+    # string defaults through ``type``, so environment values are converted too.
+    argparser.add_argument(
+        "--run-scale",
+        default=os.environ.get("RUN_SCALE") or 0.25,
+        type=float,
+        help="The scale to run the detection at, default is 0.25",
+    )
+    # NOTE: the --view-scale and --url options are not implemented yet.
+
+    stream_source = argparser.add_mutually_exclusive_group()
+    stream_source.add_argument(
+        "--capture-device",
+        default=os.environ.get("CAPTURE_DEVICE") or 0,
+        # The help text promises URL support, so do not force the value to int.
+        type=_parse_capture_source,
+        help="The capture device to use. Can also be a url.",
+    )
+
+    notification_services = argparser.add_argument_group("Notification Services")
+    notification_services.add_argument(
+        "--ntfy-url",
+        default=os.environ.get("NTFY_URL") or None,
+        type=str,
+        help="The URL to send notifications to",
+    )
+
+    args = argparser.parse_args()
+
+    # Check if a CUDA GPU is available. If it is, select it via torch. If not,
+    # fall back to the CPU.
+    # https://github.com/ultralytics/ultralytics/issues/3084#issuecomment-1732433168
+    if torch.cuda.is_available():
+        torch.cuda.set_device(0)
+        print("Set CUDA device")
+    else:
+        print("No CUDA device available, using CPU")
+
+    model = YOLO("yolov8n.pt")
+
+    video_capture = cv2.VideoCapture(args.capture_device)
+    if not video_capture.isOpened():
+        # Without this check the first read() fails and cv2.resize crashes.
+        print(f"Could not open capture device: {args.capture_device}")
+        video_capture.release()
+        return
+
+    # Eliminate lag by setting the buffer size to 1
+    # This makes it so that the video capture will only grab the most recent frame
+    # However, this means that the video may be choppy
+    video_capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
+
+    # Print the resolution of the video
+    frame_width = video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)
+    frame_height = video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)
+    print(f"Video resolution: {frame_width}x{frame_height}")
+
+    print("Beginning video capture...")
+    try:
+        while True:
+            # Grab a single frame of video
+            ret, frame = video_capture.read()
+            if not ret or frame is None:
+                # The stream ended or the device dropped out; resizing a
+                # missing frame would raise inside OpenCV.
+                print("Failed to read a frame from the capture device, stopping")
+                break
+
+            # Resize the frame to a smaller size for faster recognition processing
+            run_frame = cv2.resize(
+                frame, (0, 0), fx=args.run_scale, fy=args.run_scale
+            )
+
+            results = model(run_frame)
+            for r in results:
+                # plot() returns the frame with the detections drawn on it
+                im_array = r.plot()
+                cv2.imshow("View", im_array)
+
+            # Hit 'q' on the keyboard to quit!
+            if cv2.waitKey(1) & 0xFF == ord("q"):
+                break
+    finally:
+        # Release the capture handle even if detection raised
+        print("Releasing video capture")
+        video_capture.release()
+        cv2.destroyAllWindows()
+
+main()
--- a/src/utils/init.py
+++ b/src/utils/init.py
--- a/src/utils/config_utils.py
+++ b/src/utils/config_utils.py
@ -0,0 +1,4 @@
+
+# def write_config():
+#     with open(config_path, "w") as config_file:
+#         json.dump(config, config_file, indent=4)
--- a/src/utils/notify.py
+++ b/src/utils/notify.py
@ -0,0 +1,38 @@
+import datetime
+import httpx
+
+
+def construct_ntfy_headers(
+    title: str = "Object/Person Detected",
+    tag="rotating_light",  # https://docs.ntfy.sh/publish/#tags-emojis
+    priority="default",  # https://docs.ntfy.sh/publish/#message-priority
+) -> dict:
+    """Build the header dictionary for an ntfy notification.
+
+    Args:
+        title: Value stored under the ``Title`` header.
+        tag: Value stored under the ``Tags`` header.
+        priority: Value stored under the ``Priority`` header.
+
+    Returns:
+        A dict with the keys ``Title``, ``Priority`` and ``Tags``, in that
+        order.
+    """
+    header_names = ("Title", "Priority", "Tags")
+    header_values = (title, priority, tag)
+    return dict(zip(header_names, header_values))
+
+def send_notification(data: str, headers: dict, url: str):
+    """POST a notification message to ``url``.
+
+    Args:
+        data: Message text; it is UTF-8 encoded before being sent.
+        headers: Headers passed along with the request.
+        url: Address the request is posted to.
+
+    Raises:
+        ValueError: If ``url`` or ``data`` is ``None``.
+    """
+    if any(required is None for required in (url, data)):
+        raise ValueError("url and data cannot be None")
+
+    payload = data.encode('utf-8')
+    httpx.post(url, data=payload, headers=headers)
+
+def check_last_seen(last_seen: datetime.datetime, seconds: int = 15):
+    """Report whether ``last_seen`` is missing or older than ``seconds``.
+
+    Args:
+        last_seen: Time of the previous sighting. ``None`` or an empty string
+            means there was no previous sighting.
+        seconds: Age, in seconds, that ``last_seen`` must exceed.
+
+    Returns:
+        ``True`` if ``last_seen`` is empty/``None`` or lies more than
+        ``seconds`` seconds before the current local time, otherwise
+        ``False``.
+    """
+    # Handle the "never seen" cases first: subtracting None or "" from a
+    # datetime raises TypeError, so they must not reach the arithmetic below.
+    if last_seen is None or last_seen == "":
+        return True
+
+    elapsed = datetime.datetime.now() - last_seen
+    return elapsed > datetime.timedelta(seconds=seconds)