Detect Objects + Estimate Distance and Direction | Source Code Included

愚人陆陆 2023-12-18

This article demonstrates how to use computer vision to build an application that detects an object named in a voice command, estimates its approximate distance, and uses positional information to improve the lives of blind users. The main goal of the project is to process real-time data, much like wearable technologies such as Meta and Envision, to enrich users' lives and improve their daily experience.

This tutorial covers the following steps:

Import the libraries and define the class parameters.

Define the speech-recognition and processing functions.

Detect the object from the returned parameters, find its position, calculate the average distance, and send a notification.

Import Libraries and Define Class Parameters

  • Import the speech_recognition library to capture audio from the microphone and convert speech to text
  • Import the cv2 library to capture webcam video and apply various operations to it
  • Import NumPy for mathematical operations
  • Import the Ultralytics library to use a pretrained YOLOv8 model
  • Import pyttsx3 for text-to-speech conversion
  • Import the math library for trigonometric calculations and other mathematical operations
import speech_recognition as sr
import cv2
import numpy as np
from ultralytics import YOLO
import pyttsx3
import math
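
These libraries are available on PyPI as SpeechRecognition, opencv-python, numpy, ultralytics, and pyttsx3; note that sr.Microphone additionally requires the PyAudio package.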


class_names = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "telephone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"]

object_dimensions = {  # approximate real-world object widths in meters
    "bird" : "0.10",
    "cat" : "0.45",
    "backpack" : "0.55",
    "umbrella" : "0.50",
    "bottle" : "0.20",
    "wine glass" : "0.25",
    "cup" : "0.15",
    "fork" : "0.15",
    "knife" : "0.25",
    "spoon" : "0.15",
    "banana" : "0.20",
    "apple" : "0.07",
    "sandwich" : "0.20",
    "orange" : "0.08",
    "chair" : "0.50",
    "laptop" : "0.40",
    "mouse" : "0.10",
    "remote" : "0.20",
    "keyboard" : "0.30",
    "phone" : "0.15",
    "book" : "0.18",
    "toothbrush" : "0.16"
}

The classes from the COCO dataset on which my YOLOv8 model was trained are stored in the class_names variable, and their average dimensions (in meters) are stored in the object_dimensions variable. Since this application is intended for home environments, I selected specific objects. If you want to use your own dataset, you will need to perform custom object detection and modify these variables accordingly.
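
If you do train your own model, one way to keep class_names consistent with it is to read the names from the loaded model instead of hard-coding them; a minimal sketch using the ultralytics API (note that the built-in names follow the official COCO 2017 spellings, e.g. "motorcycle" rather than "motorbike"):

from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # or your custom weights
# model.names is a dict mapping class id -> class name
class_names = [model.names[i] for i in range(len(model.names))]
print(class_names[:3])  # ['person', 'bicycle', 'car']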

Define the Speech-Recognition and Processing Functions

To create a general-purpose function that captures the requested object from a phrase, assuming the object comes at the end of the sentence (e.g. "Where is my book?", "Find the book!", "Book."), I defined a function named get_last_word. It returns the last word of the sentence, which is taken to be the object.

def get_last_word(sentence):
    # Take the last whitespace-separated token and strip trailing punctuation,
    # so "Where is my book?" yields "book" rather than "book?".
    words = sentence.split()
    return words[-1].strip(".,!?") if words else ""
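
A quick check of its behavior on the example phrases above:

print(get_last_word("Where is my book?"))  # book
print(get_last_word("Find the book!"))     # book
print(get_last_word("book."))              # book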

A function named voice_command is defined; it returns the object to search for, taken from the voice command, along with that object's average real-world size.

def voice_command():
    recognizer = sr.Recognizer()

    with sr.Microphone() as source:
        print("Waiting for voice command...")
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)

    target_object = ""  
    real_width = 0.15  

    try:
        command = recognizer.recognize_google(audio, language="en-US")
        print("Recognized command:", command)
        target_object = get_last_word(command.lower())
        if target_object:
            print("Last word:", target_object)

        if target_object in object_dimensions:
            real_width = float(object_dimensions[target_object])
            print(real_width)
        else:
            print(f"No length information found for {target_object}, using the default value of 0.15.")
    except sr.UnknownValueError:
        print("Voice cannot be understood.")
    except sr.RequestError as e:
        print("Voice recognition error; {0}".format(e))

    return target_object, real_width

A function named voice_notification is created to alert the user by voice.

def voice_notification(obj_name, direction, distance):
    engine = pyttsx3.init()
    text = "{} is at {}. It is {:.2f} meters away.".format(obj_name, direction, distance)
    engine.say(text)
    engine.runAndWait()
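
If the default voice is too fast for the user, pyttsx3 also exposes rate and volume properties; an optional tweak:

engine = pyttsx3.init()
engine.setProperty("rate", 150)    # words per minute (default is around 200)
engine.setProperty("volume", 1.0)  # 0.0 to 1.0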

Next, the YOLOv8 model is loaded; it can be downloaded from the Ultralytics site: https://docs.ultralytics.com/models/yolov8/#overview.

The distance from the camera to the object requested by voice command is calculated, and the user is notified by voice of the object's direction as a position on a clock face.
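
The distance estimate in the loop below is a rough pinhole-camera approximation: with a focal length f in pixels, a real object width W in meters, and a bounding-box width w in pixels, distance ≈ f · W / w. The code simply substitutes the frame width for f, which is a coarse assumption; for better accuracy you can calibrate f once from a measurement at a known distance. A minimal sketch of that calibration (known_distance_m is a hypothetical measurement you take yourself):

def calibrate_focal_length(known_distance_m, real_width_m, bbox_width_px):
    # Pinhole model: f = (bounding-box width in pixels * known distance) / real width
    return (bbox_width_px * known_distance_m) / real_width_m

def estimate_distance(focal_length_px, real_width_m, bbox_width_px):
    # distance = f * W / w
    return (focal_length_px * real_width_m) / bbox_width_px

# Example: a 0.18 m wide book that measures 120 px wide at 1.0 m gives f ≈ 667 px;
# the same book seen at 60 px is then estimated at ~2.0 m.
f = calibrate_focal_length(1.0, 0.18, 120)
print(estimate_distance(f, 0.18, 60))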

def main():
    # Load the YOLO model
    model = YOLO("yolov8n.pt")

    # Open the webcam and get the frame dimensions for the clock overlay and distance math
    cap = cv2.VideoCapture(0)
    frame_width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  
    frame_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  
    center_x = int(frame_width // 2)
    center_y = int(frame_height // 2)
    radius = min(center_x, center_y) - 30  # Radius of the circle on which the clock numbers are drawn

    #The target object the user wants to search for via voice command and its real-world average size
    target_object, real_width = voice_command()

    while True:
        success, img = cap.read()
        if not success:
            break

        # Predict objects using the YOLO model
        results = model.predict(img, stream=True)

        # Draw clock
        for i in range(1, 13):
            angle = math.radians(360 / 12 * i - 90)
            x = int(center_x + radius * math.cos(angle))
            y = int(center_y + radius * math.sin(angle))

            # Emphasize 3, 6, 9, and 12 with a heavier stroke
            if i % 3 == 0:
                thickness = 3
            else:
                thickness = 1

            font = cv2.FONT_HERSHEY_SIMPLEX
            cv2.putText(img, str(i), (x - 10, y + 10), font, 0.5, (0, 255, 0), thickness)

        # detect and process objects recognized by model
        for r in results:
            boxes = r.boxes

            for box in boxes:
                x1, y1, x2, y2 = box.xyxy[0]
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)  

                cls = int(box.cls)

                if target_object in class_names[cls].lower().split():
                    # Split the class name so a one-word command like "phone"
                    # also matches the multi-word COCO class "cell phone".
                    camera_width = x2 - x1
                    # Rough pinhole estimate; assumes focal length ~ frame width in pixels
                    distance = (real_width * frame_width) / camera_width

                    obj_center_x = (x1 + x2) // 2
                    obj_center_y = (y1 + y2) // 2

                    camera_middle_x = frame_width // 2
                    camera_middle_y = frame_height // 2

                    vector_x = obj_center_x - camera_middle_x
                    vector_y = obj_center_y - camera_middle_y

                    angle_deg = math.degrees(math.atan2(vector_y, vector_x))
                    if angle_deg < 0:
                        angle_deg += 360

                    if 0 <= angle_deg < 30:
                        direction = "3 o'clock"
                    elif 30 <= angle_deg < 60:
                        direction = "4 o'clock"
                    elif 60 <= angle_deg < 90:
                        direction = "5 o'clock"
                    elif 90 <= angle_deg < 120:
                        direction = "6 o'clock"
                    elif 120 <= angle_deg < 150:
                        direction = "7 o'clock"
                    elif 150 <= angle_deg < 180:
                        direction = "8 o'clock"
                    elif 180 <= angle_deg < 210:
                        direction = "9 o'clock"
                    elif 210 <= angle_deg < 240:
                        direction = "10 o'clock"
                    elif 240 <= angle_deg < 270:
                        direction = "11 o'clock"
                    elif 270 <= angle_deg < 300:
                        direction = "12 o'clock"
                    elif 300 <= angle_deg < 330:
                        direction = "1 o'clock"
                    elif 330 <= angle_deg < 360:
                        direction = "2 o'clock"
                    else:
                        direction = "Unknown Clock Position"

                    cv2.putText(img, direction, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    cv2.putText(img, "Distance: {:.2f} meters".format(distance), (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
                    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

                    # Speak on every frame in which the target is found
                    voice_notification(target_object, direction, distance)


        cv2.imshow("Webcam", img)

        k = cv2.waitKey(1)
        if k == ord("q"):
            break

    cap.release()
    cv2.destroyAllWindows()

if __name__ == "__main__":
    main()
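
As a side note, the long if/elif chain that converts the angle to a clock position can be collapsed into a little arithmetic; a behavior-equivalent sketch:

def clock_direction(angle_deg):
    # 0 degrees points right (3 o'clock); each 30-degree sector is one hour.
    hour = (int(angle_deg // 30) + 2) % 12 + 1
    return f"{hour} o'clock"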

Conclusion and Suggestions

I hope this article has demonstrated how computer vision can augment human sight and make our lives easier, and that this project inspires you to explore new ideas. You can also use your own dataset, further develop and customize this project to meet your specific needs, and add features suited to your goals. For example, you could use optical character recognition (OCR) to turn any text into speech, which can be very useful: it could help someone find a product in a store or listen to a book, among many other applications.
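
As a starting point for the OCR idea, here is a minimal sketch using the pytesseract wrapper around the Tesseract engine together with pyttsx3 (pytesseract and a local Tesseract installation are assumed extras, not part of this project):

import cv2
import pytesseract
import pyttsx3

cap = cv2.VideoCapture(0)
success, frame = cap.read()
cap.release()

if success:
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)  # grayscale usually helps OCR
    text = pytesseract.image_to_string(gray)
    if text.strip():
        engine = pyttsx3.init()
        engine.say(text)
        engine.runAndWait()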
