sipp11
5 years ago
4 changed files with 470 additions and 0 deletions
@@ -0,0 +1,337 @@
"""USAGE:

time python src/_detector.py --input ~/Desktop/5min.mp4 -o output.mp4

time python src/_detector.py --input ~/Desktop/5min.mp4 -l

"""
# import the necessary packages
import numpy as np
import argparse
import imutils
import time
import cv2
import os
import dlib
from utils import check_if_inside_the_boxes, is_it_the_same_obj, distance


# tracking
OPENCV_OBJECT_TRACKERS = {"csrt": cv2.TrackerCSRT_create}
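# only CSRT is registered above; other OpenCV trackers could be added
# the same way, a sketch assuming an opencv-contrib build of this era
# that ships these constructors:
# OPENCV_OBJECT_TRACKERS["kcf"] = cv2.TrackerKCF_create
# OPENCV_OBJECT_TRACKERS["mil"] = cv2.TrackerMIL_create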
trackers = []
finished = []


# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True, help="path to input video")
ap.add_argument("-o", "--output", required=False, help="path to output video")
ap.add_argument("-l", "--live", action="store_true", help="show live detection")
# ap.add_argument("-y", "--yolo", required=True,
#     help="base path to YOLO directory")
ap.add_argument(
    "-c",
    "--confidence",
    type=float,
    default=0.95,
    help="minimum probability to filter weak detections",
)
ap.add_argument(
    "-t",
    "--threshold",
    type=float,
    default=0.3,
    help="threshold when applying non-maxima suppression",
)
args = vars(ap.parse_args())
# load the class labels our custom-trained YOLO model knows about
# labelsPath = os.path.sep.join([args["yolo"], "coco.names"])
labelsPath = "/Users/sipp11/syncthing/dropbox/tracking-obj/mytrain.names"
LABELS = open(labelsPath).read().strip().split("\n")
# 0 person, 1 wheelchair, 2 bicycle, 3 motorbike, 4 car, 5 bus, 6 truck


# initialize a list of colors to represent each possible class label
np.random.seed(42)
COLORS = np.random.randint(0, 255, size=(len(LABELS), 3), dtype="uint8")


# derive the paths to the YOLO weights and model configuration
weightsPath = "/Users/sipp11/syncthing/dropbox/tracking-obj/mytrain_final.weights"
configPath = "/Users/sipp11/syncthing/dropbox/tracking-obj/mytrain.cfg"


# load our YOLO object detector trained on our custom dataset
# and determine only the *output* layer names that we need from YOLO
print("[INFO] loading YOLO from disk...")
net = cv2.dnn.readNetFromDarknet(configPath, weightsPath)
ln = net.getLayerNames()
ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]
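# NOTE: the `i[0] - 1` indexing matches OpenCV builds of this period,
# where getUnconnectedOutLayers() returns Nx1 arrays; newer releases
# (around 4.5.4 and later) return a flat array, which would need
# `ln[i - 1]` instead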


def detect_stuffs(net, frame):
    # construct a blob from the input frame and then perform a forward
    # pass of the YOLO object detector, giving us our bounding boxes
    # and associated probabilities
    blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    start = time.time()
    layerOutputs = net.forward(ln)
    end = time.time()

    # initialize our lists of detected bounding boxes, confidences,
    # and class IDs, respectively
    boxes = []
    confidences = []
    classIDs = []

    # loop over each of the layer outputs
    for output in layerOutputs:
        # print(f'[{_frame_count:08d}] output -> ', len(output))
        # loop over each of the detections
        for detection in output:
            # extract the class ID and confidence (i.e., probability)
            # of the current object detection
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]

            # filter out weak predictions by ensuring the detected
            # probability is greater than the minimum probability
            if confidence <= args["confidence"]:
                continue

            # scale the bounding box coordinates back relative to
            # the size of the image, keeping in mind that YOLO
            # actually returns the center (x, y)-coordinates of
            # the bounding box followed by the box's width and
            # height
            box = detection[0:4] * np.array([W, H, W, H])
            (centerX, centerY, width, height) = box.astype("int")

            # use the center (x, y)-coordinates to derive the top
            # and left corner of the bounding box
            x = int(centerX - (width / 2))
            y = int(centerY - (height / 2))

            # update our list of bounding box coordinates,
            # confidences, and class IDs
            boxes.append([x, y, int(width), int(height)])
            confidences.append(float(confidence))
            classIDs.append(classID)

    # apply non-maxima suppression to suppress weak, overlapping
    # bounding boxes
    idxs = cv2.dnn.NMSBoxes(
        boxes, confidences, args["confidence"], args["threshold"]
    )

    # ensure `idxs` is always an array so the caller can safely call
    # .flatten() even when no detection survived NMS (the original
    # `continue` here was a bug: it is not inside a loop)
    if len(idxs) == 0:
        idxs = np.empty((0,), dtype=int)

    # NOTE: we are not going to draw anything from DETECTION,
    # only from tracking
    # loop over the indexes we are keeping
    # for i in idxs.flatten():
    #     # extract the bounding box coordinates
    #     (x, y) = (boxes[i][0], boxes[i][1])
    #     (w, h) = (boxes[i][2], boxes[i][3])

    #     # draw a bounding box rectangle and label on the frame
    #     color = [int(c) for c in COLORS[classIDs[i]]]
    #     cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
    #     text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
    #     cv2.putText(
    #         frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2
    #     )
    return idxs, boxes, confidences, classIDs, start, end
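# a minimal usage sketch for detect_stuffs (assumes W and H are already
# set and `frame` is a BGR image read via cv2):
# idxs, boxes, confidences, classIDs, t0, t1 = detect_stuffs(net, frame)
# for i in idxs.flatten():
#     x, y, w, h = boxes[i]
#     print(LABELS[classIDs[i]], confidences[i], (x, y, w, h))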


# initialize the video stream, pointer to output video file, and
# frame dimensions
vs = cv2.VideoCapture(args["input"])
writer = None
(W, H) = (None, None)

# try to determine the total number of frames in the video file
try:
    prop = (
        cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() else cv2.CAP_PROP_FRAME_COUNT
    )
    total = int(vs.get(prop))
    print("[INFO] {} total frames in video".format(total))

# an error occurred while trying to determine the total
# number of frames in the video file
except Exception:
    print("[INFO] could not determine # of frames in video")
    print("[INFO] no approx. completion time can be provided")
    total = -1

_frame_count = 0
tracker_counter = 1


# loop over frames from the video file stream
while True:
    # read the next frame from the file
    (grabbed, frame) = vs.read()

    # if the frame was not grabbed, then we have reached the end
    # of the stream
    if not grabbed:
        break

    # if the frame dimensions are empty, grab them
    if W is None or H is None:
        (H, W) = frame.shape[:2]

    _frame_count += 1

    # dlib's trackers expect RGB frames (OpenCV delivers BGR)
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # only run the (expensive) detector every 15 frames, i.e. roughly
    # once a second at 15 fps; the trackers carry objects in between
    if _frame_count % 15 == 1:
        idxs, boxes, confidences, classIDs, start, end = detect_stuffs(net, frame)
        # loop over the indexes we are keeping
        for i in idxs.flatten():
            # extract the bounding box coordinates
            (x, y) = (boxes[i][0], boxes[i][1])
            (w, h) = (boxes[i][2], boxes[i][3])

            _class = LABELS[classIDs[i]]
            _good = check_if_inside_the_boxes(x, y, w, h, _class)
            if not _good:
                continue

            # (1) check whether it's the same object as one in trackers
            is_same = False
            for t in trackers:
                tracker = t["tracker"]
                if _class != t["type"]:
                    continue

                pos = tracker.get_position()
                # use dedicated names here; the original code reused
                # `i` and clobbered the detection-loop index above
                px = int(pos.left())
                py = int(pos.top())
                _w = int(pos.right()) - px
                _h = int(pos.bottom()) - py
                print(f"[{t['id']}] - {t['type']}")
                is_same = is_it_the_same_obj(x, y, w, h, px, py, _w, _h, id=t["id"])
                if is_same:
                    break

            if not is_same:
                # create a new object tracker for this bounding box and
                # add it to our multi-object tracker
                # tracker = OPENCV_OBJECT_TRACKERS[args["tracker"]]()
                # trackers.add(tracker, frame, box)
                tracker = dlib.correlation_tracker()
                rect = dlib.rectangle(x, y, x + w, y + h)
                print("NEW TRACKER rect", rect)

                t = {
                    "id": tracker_counter,
                    "type": _class,
                    "tracker": tracker,
                    "direction": "",
                    "last_distance": -1,
                    "last_position": (x + w / 2, y + h / 2),
                    "still": 0,
                }
                tracker_counter += 1
                tracker.start_track(frame_rgb, rect)
                trackers.append(t)
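                # from here on the dlib correlation tracker follows the
                # object on its own: update() advances it on each new
                # frame and get_position() reports the current box (see
                # the tracker loop below)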

            print(f" i -> {i} ({x},{y}), {w},{h} ({x + w},{y + h})")

            # # draw a bounding box rectangle and label on the frame
            # color = [int(c) for c in COLORS[classIDs[i]]]
            # cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            # text = "{}: {:.4f}".format(LABELS[classIDs[i]], confidences[i])
            # cv2.putText(
            #     frame, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2
            # )

        _what = ",".join([LABELS[c] for c in classIDs])
        print(f"[{_frame_count:08d}] :: {_what}")

    untracking = []
    for tk in trackers:
        tk["tracker"].update(frame_rgb)
        pos = tk["tracker"].get_position()

        # unpack the position object
        startX = int(pos.left())
        startY = int(pos.top())
        endX = int(pos.right())
        endY = int(pos.bottom())

        tcx, tcy = (startX + endX) / 2, (startY + endY) / 2
        # calculate how far the centroid moved since the last frame
        _x, _y = tk["last_position"]
        _d = distance(_x, _y, tcx, tcy)
        _last_distance = tk["last_distance"]
        tk["last_distance"] = _d
        tk["last_position"] = (tcx, tcy)
        STILL_DISTANCE_IN_PX = 2
        if _last_distance < STILL_DISTANCE_IN_PX and _d < STILL_DISTANCE_IN_PX:
            tk["still"] += 1
        else:
            tk["still"] = 0

        # stop tracking anything that has been still for more than 30
        # frames or has drifted to the left/right edge of the frame
        if tk["still"] > 30 or tcx < 10 or tcx > 1200:
            untracking.append(tk)

        cv2.rectangle(frame, (startX, startY), (endX, endY), (0, 255, 0), 2)
        color = [int(c) for c in COLORS[0]]
        print(
            f"{tk['id']} - {tk['type']} - centroid: {tcx, tcy} - distance: "
            f"[stl:{tk['still']}] {_last_distance:.3f} -> {_d:.3f}"
        )
        cv2.putText(
            frame,
            f"{tk['id']} - {tk['type']}",
            (startX, startY - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            color,
            2,
        )

    # untracking: drop finished trackers from the active list
    untracking_ids = [ut["id"] for ut in untracking]
    trackers = [tk for tk in trackers if tk["id"] not in untracking_ids]
    finished += untracking

    if args["live"]:
        cv2.imshow("Frame", frame)
        key = cv2.waitKey(1) & 0xFF

        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break

    if args["output"]:
        # check if the video writer is None
        if writer is None:
            # initialize our video writer
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            writer = cv2.VideoWriter(
                args["output"], fourcc, 30, (frame.shape[1], frame.shape[0]), True
            )
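            # NOTE: MJPG is a safe default for .avi output; since the
            # USAGE above writes to an .mp4 file, a fourcc such as
            # "mp4v" may be needed instead, depending on the
            # OpenCV/FFmpeg build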

            # some information on processing a single frame
            if total > 0:
                elap = end - start
                print("[INFO] single frame took {:.4f} seconds".format(elap))
                print(
                    "[INFO] estimated total time to finish: {:.4f}".format(elap * total)
                )

        # write the output frame to disk
        writer.write(frame)

# release the file pointers
print("[INFO] cleaning up...")
if writer:
    writer.release()
vs.release()
@@ -0,0 +1,119 @@
import math


# detecting areas
AREAS = [
    [
        ("id", 1),
        ("area", ((0, 40), (12, 129))),
        ("target", ["car", "bus", "motorbike"]),
    ],
    [("id", 2), ("area", ((85, 0), (222, 74))), ("target", ["person", "bicycle"])],
    [("id", 3), ("area", ((38, 340), (99, 482))), ("target", ["person", "wheelchair"])],
    [
        ("id", 4),
        ("area", ((106, 310), (164, 461))),
        ("target", ["person", "wheelchair"]),
    ],
    [
        ("id", 5),
        ("area", ((286, 230), (441, 346))),
        ("target", ["person", "wheelchair"]),
    ],
    [
        ("id", 6),
        ("area", ((421, 190), (555, 304))),
        ("target", ["car", "bus", "motorbike"]),
    ],
    [
        ("id", 7),
        ("area", ((555, 170), (720, 295))),
        ("target", ["person", "wheelchair", "bicycle"]),
    ],
    [
        ("id", 8),
        ("area", ((877, 224), (947, 334))),
        ("target", ["person", "wheelchair"]),
    ],
    [
        ("id", 9),
        # NOTE: the second x-coordinate (112) is smaller than the first
        # (1047), so this area can never match; it looks like a typo
        ("area", ((1047, 229), (112, 338))),
        ("target", ["person", "wheelchair"]),
    ],
    [
        ("id", 10),
        ("area", ((1158, 200), (1230, 307))),
        ("target", ["person", "wheelchair"]),
    ],
]
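# each AREAS entry is a list of (key, value) pairs that dict() turns
# into a mapping at lookup time, e.g.:
# dict(AREAS[0]) == {
#     "id": 1,
#     "area": ((0, 40), (12, 129)),
#     "target": ["car", "bus", "motorbike"],
# }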


def distance(x2, y2, x1, y1):
    return math.sqrt(math.pow(x2 - x1, 2) + math.pow(y2 - y1, 2))
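# equivalently, the stdlib offers math.hypot(x2 - x1, y2 - y1)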


def check_if_inside_the_boxes(x, y, w, h, _type):
    cx, cy = x + w / 2, y + h / 2
    # print(cx, cy)
    is_inside = False
    for _box in AREAS:
        if is_inside:
            break
        box = dict(_box)
        ((x1, y1), (x2, y2)) = box["area"]
        # print(x1, cx, x2, ' -- ', y1, cy, y2, _type, box['target'])
        if x1 < cx and cx < x2 and y1 < cy and cy < y2 and _type in box["target"]:
            # print('inside --> ', _type, cx, cy, box['id'])
            is_inside = True
        # if diff_x < box_w
        if is_inside:
            print("INSIDE!! this -> ", box)
    return is_inside


def is_it_the_same_obj(x1, y1, w1, h1, i1, j1, w2, h2, **kwargs):
    """Use the centroid locations to check whether two boxes are the
    same object and, of course, their dimensions too.
    """
    _id = kwargs.get("id", None)
    if _id:
        print(" :: check against id:", _id)
    DIMENSION_SHIFT = 0.15
    # we have to use the centroid (from experience)
    cx1, cy1, cx2, cy2 = x1 + w1 / 2, y1 + h1 / 2, i1 + w2 / 2, j1 + h2 / 2

    c_dff_x, c_dff_y = abs(cx2 - cx1), abs(cy2 - cy1)
    w_shift, h_shift = w1 * DIMENSION_SHIFT, h1 * DIMENSION_SHIFT
    print(" ::SAME:: shift")
    print(f" ---> SHIFT --> w:{w_shift}, h:{h_shift}")
    print(f" ---> centroid {c_dff_x}, {c_dff_y}")
    if c_dff_x > w_shift and c_dff_y > h_shift:
        print(" ::SAME:: shifted too much already -- NOT THE SAME")
        return False

    # if one is inside the other
    if i1 > x1 and (w1 - w2) > i1 - x1 and j1 > y1 and h1 - h2 > j1 - y1:
        # the new box is inside the existing tracker's box
        print(" ::SAME:: new one inside existing tracker")
        return True
    if x1 > i1 and (w2 - w1) > x1 - i1 and y1 > j1 and h2 - h1 > y1 - j1:
        # the existing tracker's box is inside the new box
        print(" ::SAME:: existing tracker inside new tracker")
        return True

    # if neither is inside the other, fall back to comparing "size"
    size1, size2 = w1 * h1, w2 * h2
    # if the sizes differ by more than 45%, it's not the same thing
    print(f" ---> size {size1}, {size2}, diff % : {abs(size2 - size1)/size1}")
    print(" ::SAME:: size")
    if abs(size2 - size1) / size1 > 0.45:
        print(" ::SAME:: too different in size -- NOT THE SAME")
        return False

    print(" ::SAME:: last")
    return True


if __name__ == "__main__":
    check_if_inside_the_boxes(461, 263, 24, 65, "person")
    check_if_inside_the_boxes(8, 45, 172, 193, "bus")
    check_if_inside_the_boxes(300, 300, 24, 65, "person")
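    # a quick, hypothetical sanity check for is_it_the_same_obj: two
    # boxes this close in centroid and size should be reported as the
    # same object
    # print(is_it_the_same_obj(100, 100, 50, 80, 103, 102, 52, 78))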