Simple object detection

2023-10-01 19:56:40 -05:00 · 2023-10-01 19:56:40 -05:00 · c56d7c86fc
parent 3a2ed7d4eb
commit c56d7c86fc
12 changed files with 2507 additions and 221 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 .env
 config/
 using_yolov8.ipynb
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
 3.10.5
--- a/environment.yml
+++ b/environment.yml
--- a/main.py
+++ b/main.py
@ -1,209 +0,0 @@
 import datetime
 import face_recognition
 import cv2
 import numpy as np
 from dotenv import load_dotenv
 import os
 import json
 import pathlib
 import requests
 import time
 # Load variables from a local .env file (if any) before the lookups below.
 load_dotenv()
 # Settings taken from the environment. Each value is a string when the
 # variable is set and None when it is not.
 URL = os.environ.get("URL")
 RUN_SCALE = os.environ.get("RUN_SCALE")
 VIEW_SCALE = os.environ.get("VIEW_SCALE")
 RUN_BY_COMPOSE = os.environ.get("RUN_BY_COMPOSE")
 NTFY_URL = os.environ.get("NTFY_URL")
 # Timestamp pattern shared by strftime/strptime for the "last_seen" values.
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
 # Hard-coded off here; the config file's "display" entry decides at runtime.
 DISPLAY = False
 def find_face_from_name(name):
     """Return the key of the first configured face whose "name" equals *name*.

     Searches the module-level ``config["faces"]`` mapping in order. Entries
     without a "name" field (the default config entries have none) are
     skipped instead of raising ``KeyError``.

     Args:
         name: The name to look for.

     Returns:
         The matching key of ``config["faces"]``, or ``None`` when no entry
         matches.
     """
     for face_key, face_info in config["faces"].items():
         # Membership test first so a missing "name" never matches name=None.
         if "name" in face_info and face_info["name"] == name:
             return face_key
     return None
 def write_config():
     """Write the module-level ``config`` dict to ``config_path`` as JSON.

     The JSON text is produced before the file is opened for writing, so a
     serialization failure (for example a value json cannot encode) raises
     without truncating the existing config file.

     Raises:
         TypeError: If ``config`` contains a value that is not JSON
             serializable.
         OSError: If ``config_path`` cannot be opened for writing.
     """
     serialized = json.dumps(config, indent=4)
     with open(config_path, "w") as config_file:
         config_file.write(serialized)
 print("Hello, world!")
 # Per-frame results; the capture loop reassigns these on every pass
 face_locations = []
 face_encodings = []
 face_names = []
 # Encodings of the configured faces and, at the same index, their config keys
 known_face_encodings = []
 known_face_names = []
 process_this_frame = True
 # Default config, written out as a template when no usable config file exists
 config = {
     # Default stream URL: the "bridge" host when RUN_BY_COMPOSE is set,
     # otherwise localhost
     "URL": "rtsp://localhost:8554/wyze_cam_name"
     if not RUN_BY_COMPOSE
     else "rtsp://bridge:8554/wyze_cam_name",
     "run_scale": "0.25",
     "view_scale": "0.75",
     "faces": {
         "example1": {"image": "config/example1.jpg", "last_seen": ""},
         "example2": {"image": "config/example2.jpg", "last_seen": ""},
     },
     "ntfy_url": "https://ntfy.sh/example",
     "display": True,
 }
 config_path = pathlib.Path("config/config.json")
 # Load the config file; if it does not exist or is blank, create it from the
 # defaults above and stop so the user can edit it
 if config_path.exists() and config_path.read_text().strip():
     with open(config_path, "r") as config_file:
         config = json.load(config_file)
 else:
     # The config directory may not exist yet on a first run
     config_path.parent.mkdir(parents=True, exist_ok=True)
     with open(config_path, "w") as config_file:
         json.dump(config, config_file, indent=4)
     print("Config file created, please edit it and restart the program")
     print("For relative paths, use the format config/example.jpg")
     exit()
 # Environment values win over the config file. The keys used here match the
 # keys of the default config above ("run_scale", "view_scale", "display").
 if URL:
     config["URL"] = URL
 else:
     URL = config["URL"]
 if RUN_SCALE:
     config["run_scale"] = RUN_SCALE
 else:
     RUN_SCALE = config["run_scale"]
 # Both sources provide strings; the capture loop needs a number
 RUN_SCALE = float(RUN_SCALE)
 if VIEW_SCALE:
     config["view_scale"] = VIEW_SCALE
 else:
     VIEW_SCALE = config["view_scale"]
 VIEW_SCALE = float(VIEW_SCALE)
 if DISPLAY:
     config["display"] = DISPLAY
 else:
     DISPLAY = config["display"]
 if NTFY_URL:
     config["ntfy_url"] = NTFY_URL
 else:
     NTFY_URL = config["ntfy_url"]
 print(f"Current config: {config}")
 for face in config["faces"]:
     # Load the reference picture for this face and compute its encoding
     image_path = config["faces"][face]["image"]
     image = face_recognition.load_image_file(image_path)
     encodings = face_recognition.face_encodings(image)
     if not encodings:
         # No face was found in the picture, so there is nothing to index;
         # skip this entry instead of failing with an IndexError
         print(f"No face found in {image_path}, skipping {face}")
         continue
     known_face_encodings.append(encodings[0])
     # Append the key to the list of known face names (same index as above)
     known_face_names.append(face)
 video_capture = cv2.VideoCapture(URL)
 if not video_capture.isOpened():
     # Reading from an unopened capture only yields empty frames
     print(f"Could not open video stream {URL}")
     exit(1)
 # Eliminate lag by setting the buffer size to 1
 # This makes it so that the video capture will only grab the most recent frame
 # However, this means that the video may be choppy
 video_capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
 # Print the resolution of the video
 print(
     f"Video resolution: {video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)}x{video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)}"
 )
 print("Beginning video capture...")
 # The scale settings may still be strings (environment/config values);
 # convert once, outside the loop
 run_scale = float(RUN_SCALE)
 view_scale = float(VIEW_SCALE)
 # Factor that maps coordinates found in the run frame onto the view frame
 box_scale = view_scale / run_scale
 while True:
     # Grab a single frame of video
     ret, frame = video_capture.read()
     if not ret or frame is None:
         # Without a frame the resize below would raise; stop and clean up
         print("Failed to read a frame from the video stream, stopping")
         break
     # Make a small copy for recognition and a separate copy for display
     run_frame = cv2.resize(frame, (0, 0), fx=run_scale, fy=run_scale)
     view_frame = cv2.resize(frame, (0, 0), fx=view_scale, fy=view_scale)
     # Convert the image from BGR color (which OpenCV uses) to RGB color (which
     # face_recognition uses). cvtColor returns a new contiguous array, unlike
     # a reversed slice, which is a non-contiguous view.
     rgb_run_frame = cv2.cvtColor(run_frame, cv2.COLOR_BGR2RGB)
     # Find all the faces and face encodings in the current frame of video
     # model cnn is gpu accelerated, but hog is cpu only
     face_locations = face_recognition.face_locations(rgb_run_frame, model="hog")
     face_encodings = face_recognition.face_encodings(rgb_run_frame, face_locations)
     face_names = []
     for face_encoding in face_encodings:
         # See if the face is a match for the known face(s)
         matches = face_recognition.compare_faces(known_face_encodings, face_encoding)
         name = "Unknown"
         # Use the known face with the smallest distance to the new face
         face_distances = face_recognition.face_distance(
             known_face_encodings, face_encoding
         )
         # argmin raises on an empty array, so only look for a best match
         # when at least one known face was loaded
         if len(face_distances) > 0:
             best_match_index = np.argmin(face_distances)
             if matches[best_match_index]:
                 name = known_face_names[best_match_index]
                 last_seen = config["faces"][name]["last_seen"]
                 if last_seen == "":
                     # Never seen before: backdate the timestamp by 15 seconds
                     # so the 10 second check below passes and a notification
                     # goes out. The local value is refreshed too, otherwise
                     # strptime would be handed the empty string.
                     print(f"{name} has been seen for the first time")
                     last_seen = (
                         datetime.datetime.now() - datetime.timedelta(seconds=15)
                     ).strftime(DATETIME_FORMAT)
                     config["faces"][name]["last_seen"] = last_seen
                     write_config()
                 # Notify only if the face was last seen more than 10 seconds ago
                 if datetime.datetime.now() - datetime.datetime.strptime(
                     last_seen, DATETIME_FORMAT
                 ) > datetime.timedelta(seconds=10):
                     print(f"{name} has been seen")
                     # Send a notification
                     print(f"Sending notification to{NTFY_URL}")
                     try:
                         requests.post(
                             NTFY_URL,
                             data=f'"{name}" has been seen',
                             headers={
                                 "Title": "Face Detected",
                                 "Priority": "default",
                                 "Tags": "neutral_face",
                             },
                             # Bound the wait so a slow server cannot stall the loop
                             timeout=10,
                         )
                     except requests.RequestException as error:
                         # A failed notification should not end the capture loop
                         print(f"Failed to send notification: {error}")
                 # Update the last seen time
                 config["faces"][name]["last_seen"] = datetime.datetime.now().strftime(
                     DATETIME_FORMAT
                 )
                 write_config()
         face_names.append(name)
     # Display the results
     # Iterate over each face found in the frame to draw a box around it
     for (top, right, bottom, left), name in zip(face_locations, face_names):
         # Scale the locations from the run frame up to the view frame
         top = int(top * box_scale)
         right = int(right * box_scale)
         bottom = int(bottom * box_scale)
         left = int(left * box_scale)
         # Draw a box around the face
         cv2.rectangle(view_frame, (left, top), (right, bottom), (0, 0, 255), 2)
         # Draw a label with a name below the face
         cv2.rectangle(
             view_frame, (left, bottom - 35), (right, bottom), (0, 0, 255), cv2.FILLED
         )
         font = cv2.FONT_HERSHEY_DUPLEX
         cv2.putText(
             view_frame, name, (left + 6, bottom - 6), font, 1.0, (255, 255, 255), 1
         )
     # Display the resulting image if the config's "display" entry is true
     if config["display"]:
         cv2.imshow("Scaled View", view_frame)
     # Hit 'q' on the keyboard to quit!
     if cv2.waitKey(1) & 0xFF == ord("q"):
         break
 # Release handle to the video source
 print("Releasing video capture")
 video_capture.release()
 cv2.destroyAllWindows()
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,58 @@
 [tool.poetry]
 name = "detect-it"
 version = "0.1.0"
 description = "Detect all the things"
 authors = ["slashtechno <77907286+slashtechno@users.noreply.github.com>"]
 license = "MIT"
 readme = "README.md"
 packages = [{include = "detect_it"}]
 # Runtime dependencies of the package.
 [tool.poetry.dependencies]
 python = "^3.10"
 python-dotenv = "^1.0.0"
 httpx = "^0.25.0"
 opencv-python = "^4.8.1.78"
 ultralytics = "^8.0.190"
 hjson = "^3.1.0"
 numpy = "^1.23.2"
 # Each PyTorch package is declared twice: the "+cu118" build from the
 # torch_cu118 source when the "cuda" extra is selected, and the "+cpu" build
 # from the torch_cpu source otherwise.
 torch = [
  { version = "^2.0.0+cu118", source = "torch_cu118", markers = "extra=='cuda'" },
  { version = "^2.0.0+cpu", source = "torch_cpu", markers = "extra!='cuda'" },
 ]
 torchaudio = [
  { version = "^2.0.0+cu118", source = "torch_cu118", markers = "extra=='cuda'" },
  { version = "^2.0.0+cpu", source = "torch_cpu", markers = "extra!='cuda'" },
 ]
 torchvision = [
  { version = "^0.15+cu118", source = "torch_cu118", markers = "extra=='cuda'" },
  { version = "^0.15+cpu", source = "torch_cpu", markers = "extra!='cuda'" },
 ]
 [tool.poetry.group.dev.dependencies]
 black = "^23.9.1"
 ruff = "^0.0.291"
 ipykernel = "^6.25.2"
 [[tool.poetry.source]]
 name = "torch_cpu"
 url = "https://download.pytorch.org/whl/cpu"
 priority = "supplemental"
 [[tool.poetry.source]]
 name = "torch_cu118"
 url = "https://download.pytorch.org/whl/cu118"
 priority = "supplemental"
 [tool.poetry.extras]
 cuda = []
 [[tool.poetry.source]]
 name = "PyPI"
 priority = "primary"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,11 +0,0 @@
 # certifi @ file:///croot/certifi_1665076670883/work/certifi
 click==8.1.3
 dlib==19.24.0
 face-recognition==1.3.0
 face-recognition-models==0.3.0
 numpy==1.23.5
 opencv-python==4.6.0.66
 Pillow==9.3.0
 python-dotenv==0.21.0
 urllib3==1.26.13
 requests==2.31.0
--- a/src/init.py
+++ b/src/init.py
--- a/src/main.py
+++ b/src/main.py
@ -0,0 +1,129 @@
 # import face_recognition
 import cv2
 import numpy as np
 import dotenv
 from pathlib import Path
 import os
 import time
 # import hjson as json
 import torch
 from ultralytics import YOLO
 import argparse
 from .utils import notify, config_utils
 # strftime/strptime-style timestamp pattern, e.g. "2023-10-01 19:56:40"
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
 # Parsed command-line arguments; main() assigns this through `global args`
 args = None
 def _env_default(name, fallback):
     """Return environment variable *name*, or *fallback* if unset or empty."""
     value = os.environ.get(name, "")
     return value if value != "" else fallback


 def _capture_source(value):
     """Turn a --capture-device value into an int index or a string source.

     Values that parse as an integer become an int (a capture device index);
     anything else is returned unchanged so that a stream URL can be given,
     as the option's help text promises.
     """
     try:
         return int(value)
     except ValueError:
         return value


 def main():
     """Run YOLOv8 detection on a video source and show the annotated frames.

     Loads a ``.env`` file when one exists, parses the command-line options
     (each falling back to an environment variable, then to a built-in
     default), stores them in the module-level ``args``, opens the capture
     source and displays annotated frames in a window until 'q' is pressed
     or a frame can no longer be read.
     """
     global args
     if Path(".env").is_file():
         dotenv.load_dotenv()
         print("Loaded .env file")
     else:
         print("No .env file found")
     argparser = argparse.ArgumentParser(
         prog="Detect It",
         description="Detect it all!",
         epilog=":)",
     )
     argparser.add_argument(
         "--run-scale",
         # A string default taken from the environment is converted by `type`
         default=_env_default("RUN_SCALE", 0.25),
         type=float,
         help="The scale to run the detection at, default is 0.25",
     )
     stream_source = argparser.add_mutually_exclusive_group()
     stream_source.add_argument(
         "--capture-device",
         default=_env_default("CAPTURE_DEVICE", 0),
         # Accept both a device index and a URL (a plain int type rejects URLs)
         type=_capture_source,
         help="The capture device to use. Can also be a url.",
     )
     notification_services = argparser.add_argument_group("Notification Services")
     notification_services.add_argument(
         "--ntfy-url",
         default=_env_default("NTFY_URL", None),
         type=str,
         help="The URL to send notifications to",
     )
     args = argparser.parse_args()
     # Use the first CUDA GPU if one is available, otherwise the CPU
     # https://github.com/ultralytics/ultralytics/issues/3084#issuecomment-1732433168
     device = "0" if torch.cuda.is_available() else "cpu"
     if device == "0":
         torch.cuda.set_device(0)
         print("Set CUDA device")
     else:
         print("No CUDA device available, using CPU")
     model = YOLO("yolov8n.pt")
     video_capture = cv2.VideoCapture(args.capture_device)
     try:
         # Eliminate lag by setting the buffer size to 1
         # This makes it so that the video capture will only grab the most
         # recent frame. However, this means that the video may be choppy
         video_capture.set(cv2.CAP_PROP_BUFFERSIZE, 1)
         print(
             f"Video resolution: {video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)}x{video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)}"  # noqa: E501
         )
         print("Beginning video capture...")
         while True:
             # Grab a single frame of video
             ret, frame = video_capture.read()
             if not ret or frame is None:
                 # Without a frame the resize below would raise; stop instead
                 print("Failed to read a frame from the capture device, stopping")
                 break
             # Shrink the frame for faster detection
             run_frame = cv2.resize(
                 frame, (0, 0), fx=args.run_scale, fy=args.run_scale
             )
             # Run inference on the device selected above
             results = model(run_frame, device=device)
             for result in results:
                 # plot() returns the frame with the detections drawn on it
                 cv2.imshow("View", result.plot())
             # Hit 'q' on the keyboard to quit!
             if cv2.waitKey(1) & 0xFF == ord("q"):
                 break
     finally:
         # Release the capture handle and close windows even after an error
         print("Releasing video capture")
         video_capture.release()
         cv2.destroyAllWindows()
 # Start the detector. There is no `__name__` guard, so this call also runs
 # when the module is imported.
 main()
--- a/src/utils/init.py
+++ b/src/utils/init.py
--- a/src/utils/config_utils.py
+++ b/src/utils/config_utils.py
@ -0,0 +1,4 @@
 # def write_config():
 #     with open(config_path, "w") as config_file:
 #         json.dump(config, config_file, indent=4)
--- a/src/utils/notify.py
+++ b/src/utils/notify.py
@ -0,0 +1,38 @@
 import datetime
 import httpx
 def construct_ntfy_headers(
     title: str = "Object/Person Detected",
     tag = "rotating_light",  # https://docs.ntfy.sh/publish/#tags-emojis
     priority = "default",  # https://docs.ntfy.sh/publish/#message-priority
 ) -> dict:
     """Assemble the header mapping for an ntfy notification.

     Args:
         title: Value for the "Title" header.
         tag: Value for the "Tags" header.
         priority: Value for the "Priority" header.

     Returns:
         A dict with the keys "Title", "Priority" and "Tags", in that order.
     """
     header_names = ("Title", "Priority", "Tags")
     header_values = (title, priority, tag)
     return dict(zip(header_names, header_values))
 def send_notification(
     data: str,
     headers: dict,
     url: str
 ):
     """POST *data*, encoded as UTF-8, to *url* with the given *headers*.

     Args:
         data: The message text to send as the request body.
         headers: Request headers, e.g. from ``construct_ntfy_headers()``.
         url: The URL to post to.

     Raises:
         ValueError: If *url* or *data* is None.
     """
     if url is None or data is None:
         raise ValueError("url and data cannot be None")
     # Raw request bodies belong in ``content=``; httpx keeps ``data=`` for
     # form fields and warns when it is handed bytes.
     httpx.post(url, content=data.encode("utf-8"), headers=headers)
 def check_last_seen(last_seen: datetime.datetime, seconds: int = 15):
        '''
        Check if a time is older than a given number of seconds
        If it is, return True
        If last_seen is empty/null, return True
        '''
        if (
             datetime.datetime.now() - last_seen > datetime.timedelta(seconds=seconds)
            or last_seen == ""
            or last_seen is None
            ): 
            return True
        else:
            return False