i have c++ detection send frame and detection wit...

Created on: October 15, 2025

Answered using GPT-5 Thinking by Chat01

Question

I have a C++ detector that sends frames and detections through shared memory to a Python tracker, but sometimes I have a problem.

Write complete code for both sides
and explain clearly

```python
import sys, os, time, math, argparse, mmap, ctypes
import numpy as np
import cv2
from loguru import logger
import torch

# Add the parent directory of myhop to PYTHONPATH
sys.path.append("/media/rayfan/d/projects/HOP_system/myhop")

# yolox / tracker
from yolox.data.datasets import COCO_CLASSES
from yolox.utils.visualize import plot_tracking
from yolox.tracker.hop_tracker import HOPTracker, pixel_distribution

# From the previous code: helpers for the dynamic detection rate
from track_utils_shm import dbscan_clustering, detection_rate_adjuster

# shared memory layout
from shm_layout import ShmBlock, RING_SIZE, MAX_DETS


def open_shm(shm_name: str):
    # POSIX shm_open creates a file under /dev/shm: /dev/shm/<name_without_slash>
    path = f"/dev/shm/{shm_name.lstrip('/')}"
    fd = os.open(path, os.O_RDWR)
    size = ctypes.sizeof(ShmBlock)
    mm = mmap.mmap(fd, size, mmap.MAP_SHARED, mmap.PROT_READ | mmap.PROT_WRITE)
    os.close(fd)
    block = ShmBlock.from_buffer(mm)
    return mm, block


def read_slot(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            # if slot.ready == 1 and slot.frame_id == frame_id_expected:
            #     print("im in **********************************")
            #     return slot
            return slot
        if (time.time() - t0) * 1000 > timeout_ms:
            return None
        time.sleep(0.001)


def read_slot(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    t0 = time.time()
    # frame_id_expected = 0
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            return slot  # only return when the slot is ready
            # else:
            #     continue
        if (time.time() - t0) * 1000 > timeout_ms:
            return None
        time.sleep(0.001)


def read_frame(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            if slot.ready == 1 and slot.frame_id == frame_id_expected:
                frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((slot.img_h, slot.img_w, 3))
                slot.ready = 0  # tell C++ it's safe to reuse
                block.read_frame = frame_id_expected
                return frame, slot
        if (time.time() - t0) * 1000 > timeout_ms:
            return None, None
        time.sleep(0.001)


def slot_to_tensor(slot):
    n = int(slot.num_dets)
    if n <= 0:
        return None
    arr = np.zeros((n, 6), dtype=np.float32)
    for i in range(n):
        d = slot.dets[i]
        arr[i, 0] = d.x1
        arr[i, 1] = d.y1
        arr[i, 2] = d.x2
        arr[i, 3] = d.y2
        arr[i, 4] = d.score
        arr[i, 5] = float(d.class_id)
    return arr


def read_frame(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    """
    Assumption: each slot holds an RGB uint8 frame of shape (H, W, 3),
    stored in block.slots[slot_id].frame
    """
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            # frame = np.frombuffer(slot.frame, dtype=np.uint8)
            # frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((HEIGHT_MAX, WIDTH_MAX, 3))
            frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((slot.img_h, slot.img_w, 3))
            # frame = frame.reshape((block.height, block.width, 3))
            # frame = frame.reshape((block.img_h, block.img_w, 3))
            return frame, slot
        if (time.time() - t0) * 1000 > timeout_ms:
            return None, None
        time.sleep(0.001)


from ctypes import Structure, c_uint64, c_int32, c_float, c_uint8

# Constants
WIDTH_MAX = 640
HEIGHT_MAX = 640
MAX_DETS = 1024
RING_SIZE = 8

# -----------------------
# Equivalent of struct Detection
class Detection(Structure):
    _pack_ = 1
    _fields_ = [
        ("x1", c_float),
        ("y1", c_float),
        ("x2", c_float),
        ("y2", c_float),
        ("score", c_float),
        ("class_id", c_int32)
    ]

# -----------------------
# Equivalent of struct Slot
class Slot(Structure):
    _pack_ = 1
    _fields_ = [
        ("frame_id", c_uint64),   # frame number
        ("ready", c_int32),       # 0: empty, 1: ready to read
        ("num_dets", c_int32),    # number of valid detections
        ("img_w", c_int32),
        ("img_h", c_int32),
        ("fps", c_float),
        ("dets", Detection * MAX_DETS),
        ("img", c_uint8 * (HEIGHT_MAX * WIDTH_MAX * 3))  # RGB
    ]

# -----------------------
# Equivalent of struct ShmBlock
class ShmBlock(Structure):
    _pack_ = 1
    _fields_ = [
        ("write_frame", c_uint64),  # last written frame
        ("read_frame", c_uint64),   # last read frame
        ("ring_size", c_int32),     # = RING_SIZE
        ("max_dets", c_int32),      # = MAX_DETS
        ("img_w", c_int32),
        ("img_h", c_int32),
        ("fps", c_float),
        ("slots", Slot * RING_SIZE)
    ]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', "--source", type=str, default="video", help="video or webcam")
    # parser.add_argument('-p', "--path", type=str, required=True, help="input video")
    parser.add_argument('-p', '--path', default=None, help='path to video file')  # <- default=None makes it optional
    parser.add_argument("-m", "--model", type=str, default="yolov5", help="kept for compatibility")
    parser.add_argument("--save_result", action="store_true")
    parser.add_argument("--dynamic", dest="dynamic", action="store_true")
    parser.add_argument("--swift", dest="swift", action="store_true")
    parser.add_argument("--accurate", dest="accurate", action="store_true", default=True)
    # added by me this line
    parser.add_argument("--dis_traj", action="store_true", help="display trajectories")

    # tracking args (same as before)
    parser.add_argument("--track_thresh", type=float, default=0.3)
    parser.add_argument("--track_buffer", type=int, default=200)  # 60
    parser.add_argument("--match_thresh", type=float, default=0.2)
    parser.add_argument("--aspect_ratio_thresh", type=float, default=0.25)
    parser.add_argument('--min_box_area', type=float, default=1)
    parser.add_argument("--shm", type=str, default="/yolo_shm", help="shared memory name")
    args = parser.parse_args()

    shm_mm, shm = open_shm(args.shm)

    # # video for display/saving
    # cap = cv2.VideoCapture(args.path)
    # if not cap.isOpened():
    #     print("Failed to open video")
    #     return
    # width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    # height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # fps = cap.get(cv2.CAP_PROP_FPS) or shm.fps or 30.0
    width = shm.img_w
    height = shm.img_h
    # width = 640
    # height = 640
    fps = shm.fps

    if args.save_result:
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter("tracked.mp4", fourcc, fps, (width, height))
    else:
        out = None

    # choose the sampling strategy (as before)
    if args.swift:
        sampling_strategy = 1
    elif args.accurate:
        sampling_strategy = 2
    else:
        sampling_strategy = 0
    detection_rate = 9  # initial value

    if args.path is None:
        args.path = "no_video"

    tracker = HOPTracker(args, frame_rate=math.ceil(fps))
    light_multi_tracker = cv2.legacy.MultiTracker_create()
    frame_id = 1
    predicted_bbox = []
    online_targets = []
    light_tracker_id = []

    # main loop
    while True:
        # ok, frame = cap.read()
        # if not ok: break

        # slot_id = frame_id % RING_SIZE
        # slot = shm.slots[slot_id]
        # frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((HEIGHT_MAX, WIDTH_MAX, 3))

        # slot, frame = read_frame(shm, frame_id)
        # if frame is None:
        #     time.sleep(0.002)
        #     continue

        # frame, slot = read_frame(shm, frame_id)
        # if frame is None or slot is None:
        #     time.sleep(0.002)
        #     continue

        # read frame and slot in the correct order (read_frame -> returns (frame, slot))
        frame, slot = read_frame(shm, frame_id)
        # if frame is None or slot is None:
        if frame is None:
            print("frame is none")
            time.sleep(0.002)
            continue
        if slot is None:
            print("slot is none")
            time.sleep(0.002)
            continue

        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # # extra guard (in case frame somehow became the wrong object)
        # if not isinstance(frame, np.ndarray):
        #     logger.error(f"Expected numpy.ndarray for frame but got {type(frame)} — skipping frame {frame_id}")
        #     time.sleep(0.002)
        #     continue

        # # each frame, read its detections from shm (C++ writes every frame)
        # slot = read_slot(shm, frame_id)
        # if slot is None:
        #     # if C++ has fallen behind, skip the frame or keep waiting
        #     time.sleep(0.002)
        #     continue

        # # define an empty detection for testing
        # slot = type('Slot', (), {})()  # build an empty object
        # slot.num_dets = 0
        # slot.dets = []

        # Before tracking
        start_time = time.time()

        # to stay aligned with the previous structure: only use detections every N frames
        if frame_id % 4 == 1:  # (previously frame_id % detection_rate == 1)
            light_multi_tracker.clear()
            light_multi_tracker = cv2.legacy.MultiTracker_create()

            # dets = slot_to_tensor(slot)  # Nx6 [x1 y1 x2 y2 score class]
            # if dets is None:
            #     online_im = frame
            # else:
            #     # detect + track fuse (as before)
            #     online_targets = tracker.detect_track_fuse(
            #         dets, [height, width, frame], (640, 640)
            #     )

            dets = slot_to_tensor(slot)  # Nx6 [x1 y1 x2 y2 score class]
            if dets is None:
                print("dets is None")
                online_im = frame
            else:
                # convert numpy.ndarray to a torch.Tensor on CPU
                dets = torch.from_numpy(dets).float().cpu()
                print("/////////////////////////////////////////////////////////////////")
                print("dets = ", dets)
                for row in dets:
                    print([f"{v:.2f}" for v in row])
                print("height = ", height)
                print("width = ", width)
                print("frame = ", frame)

                online_targets = tracker.detect_track_fuse(
                    dets, [height, width, frame], (640, 640)
                )

                online_tlwhs, online_ids, online_scores, online_class_id = [], [], [], []
                predicted_bbox = []
                light_tracker_id = []
                for t in online_targets:
                    print("online_targets=", online_targets)
                    t.last_detected_frame = frame_id
                    tlwh = t.tlwh; tid = t.track_id
                    tlbr = np.asarray(tlwh).copy(); tlbr[2:] += tlbr[:2]
                    tlbr = np.append(tlbr, [t.score, t.class_id, tid, *t.mean])
                    predicted_bbox.append(tlbr)
                    if tlwh[2]*tlwh[3] > args.min_box_area and tlwh[3]/tlwh[2] >= args.aspect_ratio_thresh:
                        hist_b, hist_g, hist_r = pixel_distribution(
                            frame, int(t.tlwh[0]), int(t.tlwh[1]), int(t.tlwh[2]), int(t.tlwh[3])
                        )
                        t.color_dist = [hist_b, hist_g, hist_r]
                        # light tracker on static boxes
                        if t.mean[4] == 0 and t.mean[5] == 0 and t.mean[6] == 0 and t.mean[7] == 0:
                            light_multi_tracker.add(cv2.legacy.TrackerMedianFlow_create(), frame,
                                                    (int(t.tlwh[0]), int(t.tlwh[1]), int(t.tlwh[2]), int(t.tlwh[3])))
                            light_tracker_id.append(tid)
                        online_tlwhs.append(tlwh)
                        online_ids.append(tid)
                        online_scores.append(t.score)
                        online_class_id.append(t.class_id)

                # render
                online_im = plot_tracking(
                    image=frame, tlwhs=online_tlwhs, obj_ids=online_ids,
                    online_class_id=online_class_id, frame_id=frame_id, fps=fps,
                    scores=online_scores, class_names=COCO_CLASSES
                )
        else:
            # intermediate frames: only Kalman/light-tracker update (as in the previous structure)
            if len(predicted_bbox) != 0 and len(online_targets) != 0:
                if frame_id % max(4, detection_rate) <= 3:
                    light_ok, light_bbox = light_multi_tracker.update(frame)
                else:
                    light_ok, light_bbox = False, []
                predict_bbox = []
                for idx, each_track in enumerate(online_targets):
                    if light_ok and each_track.track_id in light_tracker_id and len(light_bbox) != 0:
                        lid = light_tracker_id.index(each_track.track_id)
                        nb = light_bbox[lid]
                        predict_bbox.append([nb[0], nb[1], nb[0]+nb[2], nb[1]+nb[3],
                                             predicted_bbox[idx][4], predicted_bbox[idx][5]])
                    else:
                        new_x = each_track.mean[0] + each_track.mean[4]
                        new_y = each_track.mean[1] + each_track.mean[5]
                        new_a = each_track.mean[2] + each_track.mean[6]
                        new_h = each_track.mean[3] + each_track.mean[7]
                        new_w = new_a * new_h
                        tlwh = [new_x - new_w/2, new_y - new_h/2, new_w, new_h]
                        tlbr = np.asarray(tlwh).copy(); tlbr[2:] += tlbr[:2]
                        tlbr = np.append(tlbr, [predicted_bbox[idx][4], predicted_bbox[idx][5]])
                        predict_bbox.append(tlbr)
                predicted_bbox = np.array(predict_bbox, dtype=np.float32)
                online_targets = tracker.hopping_update(
                    predicted_bbox, [height, width, frame], (640, 640)
                )
                online_tlwhs, online_ids, online_scores, online_class_id = [], [], [], []
                predicted_bbox = []
                for t in online_targets:
                    tlwh = t.tlwh; tid = t.track_id
                    tlbr = np.asarray(tlwh).copy(); tlbr[2:] += tlbr[:2]
                    tlbr = np.append(tlbr, [t.score, t.class_id, tid, *t.mean])
                    predicted_bbox.append(tlbr)
                    if tlwh[2]*tlwh[3] > args.min_box_area and tlwh[3]/tlwh[2] >= args.aspect_ratio_thresh:
                        online_tlwhs.append(tlwh); online_ids.append(tid)
                        online_scores.append(t.score); online_class_id.append(t.class_id)
                online_im = plot_tracking(
                    image=frame, tlwhs=online_tlwhs, obj_ids=online_ids,
                    online_class_id=online_class_id, frame_id=frame_id, fps=fps,
                    scores=online_scores, class_names=COCO_CLASSES
                )
            else:
                online_im = frame

        # Add this right after tracking
        end_time = time.time()
        tracker_time_ms = (end_time - start_time) * 1000
        print(f"Frame {frame_id}: tracker time = {tracker_time_ms:.2f} ms")

        # dynamic sampling (optional, as before)
        if (frame_id % 2) == 0 and len(online_targets) != 0:  # 5
            cluster_dic, cluster_num = dbscan_clustering(online_targets)
            detection_rate = detection_rate_adjuster(cluster_dic, cluster_num,
                                                     os.path.basename(args.path), sampling_strategy)

        # output/display
        if out is not None:
            out.write(online_im)
        else:
            cv2.imshow("HOPTrack (Python Tracker + C++ TRT Detector)", online_im)
            if cv2.waitKey(1) & 0xFF in [27, ord('q'), ord('Q')]:
                break

        frame_id += 1

    if out is not None:
        out.release()
    # cap.release()
    cv2.destroyAllWindows()
    shm_mm.close()


if __name__ == "__main__":
    main()
```

Do you want me to send you the C++ side?

```cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstring>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime.h"
#include "shm_layout.h"
#include <numeric> // for std::iota

using namespace nvinfer1;

class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} logger;

// ---------- CUDA error-handling helper ----------
#define CHECK_CUDA(expr) do { cudaError_t __e = (expr); if (__e != cudaSuccess) { \
    std::cerr << "CUDA Error: " << cudaGetErrorString(__e) << " @ " << __FILE__ << ":" << __LINE__ << "\n"; exit(1);} } while(0)

struct Args {
    std::string path;
    std::string trt;
    std::string shm_name = "/yolo_shm";
    int img = 640;
    float conf = 0.25f;
    float nms = 0.45f;
    bool dynamic = false;
};

static void parse_args(int argc, char** argv, Args& a) {
    for (int i=1; i<argc; ++i) {
        std::string k = argv[i];
        auto next = [&]() { if(i+1>=argc) {std::cerr<<"Missing value for "<<k<<"\n"; exit(1);} return std::string(argv[++i]); };
        if (k=="--path" || k=="-p") a.path = next();
        else if (k=="--trt" || k=="-trt") a.trt = next();
        else if (k=="--img") a.img = std::stoi(next());
        else if (k=="--conf") a.conf = std::stof(next());
        else if (k=="--nms") a.nms = std::stof(next());
        else if (k=="--dynamic" || k=="--dyn" || k=="--dynamic-shape") a.dynamic = true;
        else if (k=="--shm") a.shm_name = next();
        else { std::cerr<<"Unknown arg: "<<k<<"\n"; exit(1); }
    }
    if (a.path.empty() || a.trt.empty()) {
        std::cerr<<"Usage: detector_trt_shm --path <video> --trt <engine.trt> [--img 640] [--conf 0.25] [--nms 0.45] [--dynamic] [--shm /yolo_shm]\n";
        exit(1);
    }
}

// ---------- letterbox helper ----------
struct LetterboxInfo { float scale; int pad_w; int pad_h; int net; };
static cv::Mat letterbox(const cv::Mat& img, int net_size, LetterboxInfo& info) {
    int w = img.cols, h = img.rows; info.net = net_size;
    float r = std::min((float)net_size / h, (float)net_size / w);
    int new_unpad_w = std::round(w * r);
    int new_unpad_h = std::round(h * r);
    cv::Mat resized; cv::resize(img, resized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR);
    int dw = net_size - new_unpad_w;
    int dh = net_size - new_unpad_h;
    info.scale = r;
    info.pad_w = dw/2;
    info.pad_h = dh/2;
    cv::Mat out(net_size, net_size, CV_8UC3, cv::Scalar(114,114,114));
    resized.copyTo(out(cv::Rect(info.pad_w, info.pad_h, resized.cols, resized.rows)));
    return out;
}

static inline float iou(const cv::Rect2f& a, const cv::Rect2f& b) {
    float inter = (a & b).area();
    float uni = a.area() + b.area() - inter + 1e-6f;
    return inter / uni;
}

struct Det { cv::Rect2f box; float score; int cls; };

// simple NMS
static std::vector<Det> nms(const std::vector<Det>& dets, float iou_thr) {
    std::vector<int> idxs(dets.size());
    std::iota(idxs.begin(), idxs.end(), 0);
    std::sort(idxs.begin(), idxs.end(), [&](int i, int j){ return dets[i].score > dets[j].score; });
    std::vector<Det> keep;
    std::vector<char> removed(dets.size(), 0);
    for (size_t _i=0; _i<idxs.size(); ++_i) {
        int i = idxs[_i]; if (removed[i]) continue;
        keep.push_back(dets[i]);
        for (size_t _j=_i+1; _j<idxs.size(); ++_j) {
            int j = idxs[_j]; if (removed[j]) continue;
            if (iou(dets[i].box, dets[j].box) > iou_thr) removed[j] = 1;
        }
    }
    return keep;
}

// deserialize the TRT engine
static ICudaEngine* load_engine(const std::string& path, IRuntime*& runtime) {
    std::ifstream f(path, std::ios::binary);
    if(!f) { std::cerr<<"Cannot open engine: "<<path<<"\n"; exit(1); }
    f.seekg(0, std::ios::end); size_t sz = f.tellg(); f.seekg(0, std::ios::beg);
    std::vector<char> buf(sz); f.read(buf.data(), sz);

    runtime = nvinfer1::createInferRuntime(logger);
    if (!runtime) { std::cerr<<"Failed to create runtime\n"; exit(1); }
    auto* eng = runtime->deserializeCudaEngine(buf.data(), sz);
    if (!eng) { std::cerr<<"Failed to deserialize engine\n"; exit(1); }
    return eng;
}

// helper: find input/output binding index
static int find_binding_index(ICudaEngine* engine, bool input=true) {
    for (int i=0;i<engine->getNbIOTensors();++i) {
        const char* name = engine->getIOTensorName(i);
        TensorIOMode mode = engine->getTensorIOMode(name);
        if ( (input && mode==TensorIOMode::kINPUT) || (!input && mode==TensorIOMode::kOUTPUT) ) {
            return i;
        }
    }
    return -1;
}

// parse the common YOLOv5 output layouts (two well-known cases)
static void parse_yolov5_output(const float* out, const Dims& dims, float conf_thr, int net,
                                const LetterboxInfo& lb, int img_w, int img_h,
                                std::vector<Det>& out_dets)
{
    // case A: (1, N, 85) or (N, 85)
    int nb = 1, N = 0, C = 0;
    if (dims.nbDims==3) { nb = dims.d[0]; N = dims.d[1]; C = dims.d[2]; }
    else if (dims.nbDims==2) { N = dims.d[0]; C = dims.d[1]; }
    else { // case B: (1, 25200, 7) or (M, 6)
        // fall through to the generic handling below
    }
    std::cout << "Output dims: nbDims=" << dims.nbDims;
    for (int i=0;i<dims.nbDims;i++) std::cout << " " << dims.d[i];
    std::cout << std::endl;

    if (C >= 6 && N > 0) {
        for (int i=0;i<N;i++) {
            const float* p = out + i*C;
            float x = p[0], y = p[1], w = p[2], h = p[3];
            float obj = p[4];
            // pick the best class
            int best_cls = -1; float best_conf = 0.f;
            for (int k=5;k<C;k++) { float c = p[k]; if (c > best_conf) { best_conf = c; best_cls = k-5; } }
            float score = obj * best_conf;
            if (score < conf_thr) continue;
            // xywh → x1y1x2y2 in network space
            float x1 = x - w/2.f;
            float y1 = y - h/2.f;
            float x2 = x + w/2.f;
            float y2 = y + h/2.f;
            // map back to the original image (remove padding, undo scaling)
            x1 -= lb.pad_w; x2 -= lb.pad_w;
            y1 -= lb.pad_h; y2 -= lb.pad_h;
            x1 /= lb.scale; x2 /= lb.scale;
            y1 /= lb.scale; y2 /= lb.scale;
            // clamp to image bounds
            x1 = std::max(0.f, std::min((float)img_w-1.f, x1));
            y1 = std::max(0.f, std::min((float)img_h-1.f, y1));
            x2 = std::max(0.f, std::min((float)img_w-1.f, x2));
            y2 = std::max(0.f, std::min((float)img_h-1.f, y2));
            out_dets.push_back({ cv::Rect2f(cv::Point2f(x1,y1), cv::Point2f(x2,y2)), score, best_cls });
        }
        return;
    }
    // alternative case: the output is already [x1,y1,x2,y2,score,cls]
    if (C==6 || C==7) {
        for (int i=0;i<N;i++) {
            const float* p = out + i*C;
            float x1=p[0], y1=p[1], x2=p[2], y2=p[3], score=p[4];
            int cls = (int)std::round(p[5]);
            if (score < conf_thr) continue;
            // inverse scaling (probably unnecessary if decoding happened inside TRT)
            out_dets.push_back({ cv::Rect2f(cv::Point2f(x1,y1), cv::Point2f(x2,y2)), score, cls });
        }
        return;
    }
    // neither matched: for simplicity, treat the output as unrecognized.
    std::cerr << "[WARN] Unrecognized output shape; please adapt parser.\n";
}

static size_t shm_size() { return sizeof(ShmBlock); }

int main(int argc, char** argv) {
    Args args; parse_args(argc, argv, args);

    // --- Shared Memory ---
    int fd = shm_open(args.shm_name.c_str(), O_CREAT | O_RDWR, 0666);
    if (fd < 0) { perror("shm_open"); return 1; }
    if (ftruncate(fd, shm_size()) < 0) { perror("ftruncate"); return 1; }
    void* base = mmap(nullptr, shm_size(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }
    auto* shm = reinterpret_cast<ShmBlock*>(base);
    std::memset(shm, 0, sizeof(ShmBlock));
    shm->ring_size = RING_SIZE;
    shm->max_dets = MAX_DETS;

    // --- OpenCV video ---
    cv::VideoCapture cap(args.path);
    if (!cap.isOpened()) { std::cerr<<"Cannot open video: "<<args.path<<"\n"; return 1; }
    int img_w = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
    int img_h = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    float fps = (float)cap.get(cv::CAP_PROP_FPS);
    shm->img_w = img_w;
    shm->img_h = img_h;
    shm->fps = fps;

    // --- TensorRT ---
    IRuntime* runtime=nullptr;
    ICudaEngine* engine = load_engine(args.trt, runtime);
    // success message
    std::cout << "TensorRT engine loaded successfully from " << args.trt << "\n";
    IExecutionContext* ctx = engine->createExecutionContext();
    if (!ctx) { std::cerr<<"Failed to create TRT context\n"; return 1; }

    // find IO names/indices
    int in_idx = find_binding_index(engine, true);
    int out_idx = find_binding_index(engine, false);
    const char* in_name = engine->getIOTensorName(in_idx);
    const char* out_name = engine->getIOTensorName(out_idx);

    // set the input shape for dynamic engines
    if (args.dynamic) {
        Dims inDims; inDims.nbDims = 4;
        inDims.d[0]=1; inDims.d[1]=3; inDims.d[2]=args.img; inDims.d[3]=args.img;
        if (!ctx->setInputShape(in_name, inDims)) { std::cerr<<"setInputShape failed\n"; return 1; }
    }

    // buffer sizes
    Dims inDims = ctx->getTensorShape(in_name);
    Dims outDims= ctx->getTensorShape(out_name);
    size_t in_size = 1;  for (int i=0;i<inDims.nbDims;i++) in_size *= inDims.d[i];
    size_t out_size = 1; for (int i=0;i<outDims.nbDims;i++) out_size *= outDims.d[i];

    // GPU/CPU buffers
    float* d_in=nullptr; float* d_out=nullptr;
    CHECK_CUDA(cudaMalloc(&d_in, in_size * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_out, out_size * sizeof(float)));
    std::vector<float> h_out(out_size);

    // stream
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    // main loop
    uint64_t frame_id = 0;
    while (true) {
        cv::Mat frame;
        if (!cap.read(frame)) break;

        LetterboxInfo lb;
        cv::Mat lbimg = letterbox(frame, args.img, lb);

        // BGR->RGB, HWC->CHW, float32/255
        cv::Mat img_f; lbimg.convertTo(img_f, CV_32FC3, 1.0/255.0);
        std::vector<float> chw(3 * args.img * args.img);
        int hw = args.img * args.img;
        std::vector<cv::Mat> chs(3);
        for (int i=0;i<3;i++) chs[i] = cv::Mat(args.img, args.img, CV_32FC1, chw.data()+i*hw);
        cv::split(img_f, chs);

        // copy to GPU
        CHECK_CUDA(cudaMemcpyAsync(d_in, chw.data(), chw.size()*sizeof(float), cudaMemcpyHostToDevice, stream));

        // bind
        ctx->setTensorAddress(in_name, d_in);
        ctx->setTensorAddress(out_name, d_out);

        // run
        if (!ctx->enqueueV3(stream)) { std::cerr<<"enqueueV3 failed\n"; break; }

        // output to CPU
        CHECK_CUDA(cudaMemcpyAsync(h_out.data(), d_out, h_out.size()*sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK_CUDA(cudaStreamSynchronize(stream));

        // parse output
        std::vector<Det> dets;
        parse_yolov5_output(h_out.data(), outDims, args.conf, args.img, lb, img_w, img_h, dets);

        // NMS
        dets = nms(dets, args.nms);

        /*
        // 🔹 add drawing/display code here
        for (const auto& d : dets) {
            cv::rectangle(frame, d.box, cv::Scalar(0, 255, 0), 2);
            std::string label = "cls:" + std::to_string(d.cls) + " conf:" + cv::format("%.2f", d.score);
            int baseLine;
            cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
            cv::rectangle(frame, cv::Point((int)d.box.x, (int)d.box.y - labelSize.height - 2),
                          cv::Point((int)d.box.x + labelSize.width, (int)d.box.y),
                          cv::Scalar(0, 255, 0), cv::FILLED);
            cv::putText(frame, label, cv::Point((int)d.box.x, (int)d.box.y - 2),
                        cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0,0,0), 1);
        }
        */
        cv::imshow("Detections", frame);
        if (cv::waitKey(1) == 27) break; // ESC to quit

        // write to shared memory
        frame_id++;
        int slot_id = frame_id % RING_SIZE;
        Slot& slot = shm->slots[slot_id];
        slot.ready = 0; // simple lock
        slot.frame_id = frame_id;
        slot.img_w = img_w;
        slot.img_h = img_h;
        slot.fps = fps;
        int M = std::min((int)dets.size(), MAX_DETS);
        slot.num_dets = M;
        for (int i=0;i<M;i++) {
            slot.dets[i].x1 = dets[i].box.x;
            slot.dets[i].y1 = dets[i].box.y;
            slot.dets[i].x2 = dets[i].box.x + dets[i].box.width;
            slot.dets[i].y2 = dets[i].box.y + dets[i].box.height;
            slot.dets[i].score = dets[i].score;
            slot.dets[i].class_id = dets[i].cls;
        }

        // 🔹 add this before slot.ready = 1
        cv::Mat rgb; cv::cvtColor(frame, rgb, cv::COLOR_BGR2RGB);
        cv::Mat resized; cv::resize(rgb, resized, cv::Size(WIDTH_MAX, HEIGHT_MAX));
        std::memcpy(slot.img, resized.data, WIDTH_MAX * HEIGHT_MAX * 3);
        //cv::imshow("DebugFrame", resized);
        //cv::waitKey(1);
        // or save temporarily

        slot.ready = 1;
        shm->write_frame = frame_id;

        // print detection results
        std::cout << "Frame " << frame_id << " - Detections: " << M << std::endl;
        for (int i=0;i<M;i++) {
            std::cout << "  [" << i << "] "
                      << "cls=" << slot.dets[i].class_id
                      << " conf=" << slot.dets[i].score
                      << " box=(" << slot.dets[i].x1 << "," << slot.dets[i].y1 << ")-("
                      << slot.dets[i].x2 << "," << slot.dets[i].y2 << ")" << std::endl;
        }
    }

    // cleanup
    cudaStreamDestroy(stream);
    cudaFree(d_in);
    cudaFree(d_out);
    // free TensorRT objects - new style
    delete ctx;
    delete engine;
    delete runtime;
    munmap(base, shm_size());
    close(fd);
    shm_unlink(args.shm_name.c_str());
    return 0;
}
```

Thought

Writing complete code for both sides

Given the user's request, I’ll write the working code for both the C++ producer (detector) and Python consumer (tracker). I'll focus on shared memory structure, a ring buffer, and proper handling of image size/mismatch. The C++ side will fill the buffer with resized images, and Python will reshape based on the provided video size.

Fixing buffer and synchronization issues

The problem arises from mismatched image sizes. The C++ code stores the resized image as 640x640, while Python is using the original video size for reshaping, which leads to incorrect shapes. Additionally, synchronization is causing data races since the ready flags and frame_id aren't being correctly handled or cleared. Using memory fences and properly managing ready flags should resolve the issue.

Addressing memory and consistency issues

There are contradictions in how memory layout is handled. Python redefines structs that are already imported, which should be unified. Consistency between RING_SIZE and MAX_DETS across both sides needs to be maintained. Additionally, while Python uses a different method for shared memory, the C++ side's use of shm_unlink and other structures should be reviewed for compatibility. Lastly, Python's conversion to torch seems necessary as it processes the detection boxes.

Addressing issues with data race and reshape

The problem seems to be that Python isn't properly waiting for data to be ready. The read_frame function should check ready==1 and ensure the frame_id matches, but it's not doing so. This could lead to reading partially written or outdated data. Also, the reshape is mismatched because C++ stores a 640x640 frame, but Python expects different dimensions. To fix this, defining a shared memory layout header and ensuring Python mirrors the constants would help.

Proposing a solution for frame storage and reshaping

I’m thinking that the core issue is the mismatch between the buffer size and how Python is reshaping the frame. C++ writes the frame at a fixed size, but Python is expecting a specific shape. The best approach seems to be adjusting the shared memory size or altering how Python handles the reshaping by slicing the buffer correctly. This way, Python can adjust to the actual storage size, and we avoid the reshape error.

Designing shared memory handshake and frame storage

To fix issues related to frame reshaping and missed frames, we'll ensure Python correctly handles the shared memory slots. We’ll store the original frame size in the shared memory, with boxes mapped appropriately for accuracy. The C++ and Python code will handle the frame reading/writing handshake via atomic flags to ensure synchronization. I’ll also explain how to implement these fixes in a minimal, clear setup while correcting race conditions and using shared memory helpers. The code will align with the user’s existing tracker but be more robust.

Designing shared memory structure for frame storage

For frame storage, we’ll make adjustments to ensure consistency between C++ and Python. We’ll store a resized frame (640x640) while maintaining aspect ratio, and keep detection boxes in letterbox space. This ensures the tracker reads frames in consistent coordinates. The frame structure will be updated to carry necessary details, and Python will handle frame copying and synchronization. We'll clarify the process, ensuring the C++ side adapts while addressing coordinate mismatch for tracking accuracy.

Designing shared memory integration

To ensure consistency and prevent errors in frame handling, we'll write frames without resizing unless the original frame exceeds the buffer capacity, in which case the writer resizes to fit and updates the per-slot dimensions.


Answer

You’re running into two classic shared-memory pitfalls:

  1. Shape mismatch: C++ writes a fixed WIDTH_MAX×HEIGHT_MAX image (640×640 in your snippet) but sets slot.img_w/h to the original video size (e.g. 1920×1080). Your Python then tries to reshape HEIGHT_MAX*WIDTH_MAX*3 bytes into (img_h,img_w,3) → wrong size & occasional crashes/garbage.

  2. Race conditions: The Python reader ignores slot.ready and slot.frame_id, returns a slot that may still be mid-write, and never sets slot.ready=0 after reading. That leads to tearing / inconsistent detections.
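For concreteness, pitfall 1 reproduced in isolation, as a standalone numpy snippet (the sizes are assumptions matching the snippets above: a 640×640 stored image read back at 1920×1080):

```python
import numpy as np

buf = np.zeros(640 * 640 * 3, dtype=np.uint8)  # what the writer actually stored (640x640 RGB)
buf.reshape((1080, 1920, 3))                   # what the reader asks for at 1920x1080
# ValueError: cannot reshape array of size 1228800 into shape (1080,1920,3)
```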

Below is a robust, complete pair (C++ producer + Python consumer) plus a shared layout that fixes both problems:

  • A ring buffer with ready states (0=FREE, 1=WRITING, 2=READY).
  • The writer fills all fields, does a memory fence, then flips ready=2.
  • The reader waits for ready=2 && frame_id match, copies only the used bytes, then sets ready=0.
  • The image stored in SHM is the original frame size whenever it fits; if it doesn’t fit, the writer resizes to fit into the buffer and updates img_w/img_h accordingly so Python always reshapes the correct number of bytes.
  • Python slices the buffer to img_w*img_h*3 before reshape and copies to avoid reuse by the writer while tracking.

1) Shared layout (C++ header) – shm_layout.h

```cpp
#pragma once
#include <cstdint>

#define WIDTH_MAX  1920   // capacity of the image buffer in SHM
#define HEIGHT_MAX 1080
#define MAX_DETS   1024
#define RING_SIZE  8

// Slot.ready states
enum : int32_t { SLOT_FREE = 0, SLOT_WRITING = 1, SLOT_READY = 2 };

#pragma pack(push, 1)
struct Detection {
    float   x1, y1, x2, y2;  // in the coordinate system of the *stored image*
    float   score;
    int32_t class_id;
};

struct Slot {
    uint64_t  frame_id;   // monotonically increasing
    int32_t   ready;      // 0=FREE,1=WRITING,2=READY
    int32_t   num_dets;
    int32_t   img_w;      // width of stored RGB image
    int32_t   img_h;      // height of stored RGB image
    float     fps;
    Detection dets[MAX_DETS];  // detections for this frame
    // RGB image bytes. We only use the first (img_w*img_h*3) bytes each frame.
    uint8_t   img[WIDTH_MAX * HEIGHT_MAX * 3];
};

struct ShmBlock {
    uint64_t write_frame;  // last frame_id written (for reader progress)
    uint64_t read_frame;   // last frame_id fully read (for writer stats)
    int32_t  ring_size;    // = RING_SIZE
    int32_t  max_dets;     // = MAX_DETS
    // nominal camera/stream info (may match or exceed per-slot img_w/h)
    int32_t  nominal_w;
    int32_t  nominal_h;
    float    nominal_fps;
    Slot     slots[RING_SIZE];
};
#pragma pack(pop)
```

2) Python ctypes mirror – shm_layout.py

```python
# shm_layout.py
import ctypes as C

WIDTH_MAX  = 1920
HEIGHT_MAX = 1080
MAX_DETS   = 1024
RING_SIZE  = 8

SLOT_FREE, SLOT_WRITING, SLOT_READY = 0, 1, 2

class Detection(C.Structure):
    _pack_ = 1
    _fields_ = [
        ("x1", C.c_float),
        ("y1", C.c_float),
        ("x2", C.c_float),
        ("y2", C.c_float),
        ("score", C.c_float),
        ("class_id", C.c_int32),
    ]

class Slot(C.Structure):
    _pack_ = 1
    _fields_ = [
        ("frame_id", C.c_uint64),
        ("ready", C.c_int32),
        ("num_dets", C.c_int32),
        ("img_w", C.c_int32),
        ("img_h", C.c_int32),
        ("fps", C.c_float),
        ("dets", Detection * MAX_DETS),
        ("img", C.c_uint8 * (WIDTH_MAX * HEIGHT_MAX * 3)),
    ]

class ShmBlock(C.Structure):
    _pack_ = 1
    _fields_ = [
        ("write_frame", C.c_uint64),
        ("read_frame", C.c_uint64),
        ("ring_size", C.c_int32),
        ("max_dets", C.c_int32),
        ("nominal_w", C.c_int32),
        ("nominal_h", C.c_int32),
        ("nominal_fps", C.c_float),
        ("slots", Slot * RING_SIZE),
    ]
```
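A quick way to catch silent divergence between the two definitions is to compare struct sizes on both sides before wiring anything up. This small check is my addition, not part of the layout itself; the 24-byte figure simply follows from five 4-byte floats plus one int32 under `_pack_ = 1`:

```python
# layout_check.py - sanity-check the Python mirror against the C++ header.
# Compare these numbers with sizeof(Detection)/sizeof(Slot)/sizeof(ShmBlock)
# printed on the C++ side; any mismatch means the layouts have diverged.
import ctypes
from shm_layout import Detection, Slot, ShmBlock

print("Detection:", ctypes.sizeof(Detection))  # 24 bytes: 5 floats + 1 int32, packed
print("Slot     :", ctypes.sizeof(Slot))
print("ShmBlock :", ctypes.sizeof(ShmBlock))
```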

3) C++ producer (TensorRT YOLO) – detector_trt_shm.cpp

This is your code, tightened for SHM correctness and image-size consistency. (Keeps your TRT bits; focus is on how we write the slot.)

```cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstring>
#include <algorithm>
#include <atomic>
#include <numeric>
#include <memory>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime.h"
#include "shm_layout.h"

using namespace nvinfer1;

struct Logger : ILogger {
    void log(Severity s, const char* msg) noexcept override {
        if (s <= Severity::kWARNING) std::cout << msg << std::endl;
    }
} gLogger;

#define CHECK_CUDA(x) do{auto e=(x); if(e!=cudaSuccess){ \
    std::cerr<<"CUDA: "<<cudaGetErrorString(e)<<" @ "<<__FILE__<<":"<<__LINE__<<"\n"; std::exit(1);} }while(0)

struct Args {
    std::string path;
    std::string trt;
    std::string shm_name = "/yolo_shm";
    int img = 640;
    float conf = 0.25f;
    float nms = 0.45f;
    bool dynamic = false;
};

static void parse_args(int argc, char** argv, Args& a) {
    auto next = [&](int& i){
        if (i+1>=argc){ std::cerr<<"Missing value for "<<argv[i]<<"\n"; std::exit(1); }
        return std::string(argv[++i]);
    };
    for (int i=1;i<argc;++i){
        std::string k=argv[i];
        if (k=="--path"||k=="-p") a.path=next(i);
        else if (k=="--trt"||k=="-trt") a.trt=next(i);
        else if (k=="--img") a.img=std::stoi(next(i));
        else if (k=="--conf") a.conf=std::stof(next(i));
        else if (k=="--nms") a.nms=std::stof(next(i));
        else if (k=="--dynamic"||k=="--dyn"||k=="--dynamic-shape") a.dynamic=true;
        else if (k=="--shm") a.shm_name=next(i);
        else { std::cerr<<"Unknown arg "<<k<<"\n"; std::exit(1); }
    }
    if (a.path.empty() || a.trt.empty()){
        std::cerr<<"Usage: detector_trt_shm --path <video> --trt <engine.trt> [--img 640] [--conf 0.25] [--nms 0.45] [--dynamic] [--shm /yolo_shm]\n";
        std::exit(1);
    }
}

// --- trivial letterbox for TRT input ---
struct LetterboxInfo { float scale; int pad_w; int pad_h; int net; };
static cv::Mat letterbox(const cv::Mat& img, int net, LetterboxInfo& info) {
    int w=img.cols, h=img.rows; info.net=net;
    float r=std::min(net/(float)h, net/(float)w);
    int nw = std::round(w*r), nh = std::round(h*r);
    cv::Mat resized; cv::resize(img, resized, cv::Size(nw,nh));
    int dw = net - nw, dh = net - nh;
    info.scale=r; info.pad_w=dw/2; info.pad_h=dh/2;
    cv::Mat out(net, net, CV_8UC3, cv::Scalar(114,114,114));
    resized.copyTo(out(cv::Rect(info.pad_w, info.pad_h, resized.cols, resized.rows)));
    return out;
}

// (IoU + NMS helpers)
static inline float iou(const cv::Rect2f& a, const cv::Rect2f& b){
    float inter=(a&b).area();
    float uni=a.area()+b.area()-inter+1e-6f;
    return inter/uni;
}
struct Det { cv::Rect2f box; float score; int cls; };
static std::vector<Det> nms(const std::vector<Det>& ds, float thr){
    std::vector<int> idx(ds.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int i,int j){return ds[i].score>ds[j].score;});
    std::vector<Det> keep; std::vector<char> rm(ds.size(),0);
    for(size_t a=0;a<idx.size();++a){
        int i=idx[a]; if(rm[i]) continue;
        keep.push_back(ds[i]);
        for(size_t b=a+1;b<idx.size();++b){
            int j=idx[b]; if(rm[j]) continue;
            if(iou(ds[i].box,ds[j].box)>thr) rm[j]=1;
        }
    }
    return keep;
}

// Parse typical YOLOv5 head output (x,y,w,h,obj, +classes...),
// *returning boxes in the coordinate system of the stored image*.
static void parse_yolov5_output(const float* out, const Dims& dims, float conf_thr,
                                const LetterboxInfo& lb, int src_w, int src_h,
                                std::vector<Det>& out_dets)
{
    // dims like (1, N, C) or (N, C)
    int N=0, C=0;
    if (dims.nbDims==3){ N=dims.d[1]; C=dims.d[2]; }
    else if (dims.nbDims==2){ N=dims.d[0]; C=dims.d[1]; }
    if (N<=0 || C<6) { std::cerr<<"[WARN] unexpected output dims\n"; return; }
    for (int i=0;i<N;i++){
        const float* p = out + i*C;
        float x=p[0], y=p[1], w=p[2], h=p[3], obj=p[4];
        int best_cls=-1; float best_c=0.f;
        for (int k=5;k<C;k++){ if (p[k]>best_c){ best_c=p[k]; best_cls=k-5; } }
        float score = obj * best_c;
        if (score < conf_thr) continue;
        // xywh (net) -> x1y1x2y2 (net)
        float x1 = x - w/2.f, y1 = y - h/2.f;
        float x2 = x + w/2.f, y2 = y + h/2.f;
        // Undo letterbox to original source space
        x1 -= lb.pad_w; x2 -= lb.pad_w;
        y1 -= lb.pad_h; y2 -= lb.pad_h;
        x1 /= lb.scale; y1 /= lb.scale;
        x2 /= lb.scale; y2 /= lb.scale;
        // clamp
        x1 = std::max(0.f, std::min((float)src_w-1.f, x1));
        y1 = std::max(0.f, std::min((float)src_h-1.f, y1));
        x2 = std::max(0.f, std::min((float)src_w-1.f, x2));
        y2 = std::max(0.f, std::min((float)src_h-1.f, y2));
        out_dets.push_back({ cv::Rect2f({x1,y1},{x2,y2}), score, best_cls });
    }
}

static ICudaEngine* load_engine(const std::string& path, IRuntime*& rt){
    std::ifstream f(path, std::ios::binary);
    if (!f){ std::cerr<<"Cannot open engine "<<path<<"\n"; std::exit(1); }
    f.seekg(0,std::ios::end); size_t sz=f.tellg(); f.seekg(0,std::ios::beg);
    std::vector<char> buf(sz); f.read(buf.data(), sz);
    rt = createInferRuntime(gLogger);
    if (!rt){ std::cerr<<"createInferRuntime failed\n"; std::exit(1); }
    auto* eng = rt->deserializeCudaEngine(buf.data(), sz);
    if (!eng){ std::cerr<<"deserializeCudaEngine failed\n"; std::exit(1); }
    return eng;
}

static size_t shm_size(){ return sizeof(ShmBlock); }

int main(int argc, char** argv){
    Args args; parse_args(argc, argv, args);

    // --- SHM ---
    int fd = shm_open(args.shm_name.c_str(), O_CREAT|O_RDWR, 0666);
    if (fd<0){ perror("shm_open"); return 1; }
    if (ftruncate(fd, shm_size())<0){ perror("ftruncate"); return 1; }
    void* base = mmap(nullptr, shm_size(), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if (base==MAP_FAILED){ perror("mmap"); return 1; }
    auto* shm = reinterpret_cast<ShmBlock*>(base);
    std::memset(shm, 0, sizeof(ShmBlock));
    shm->ring_size = RING_SIZE;
    shm->max_dets = MAX_DETS;

    // --- Video ---
    cv::VideoCapture cap(args.path);
    if (!cap.isOpened()){ std::cerr<<"Cannot open "<<args.path<<"\n"; return 1; }
    int src_w = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
    int src_h = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    float fps = (float)(cap.get(cv::CAP_PROP_FPS) > 0 ? cap.get(cv::CAP_PROP_FPS) : 30.0);
    shm->nominal_w = src_w;
    shm->nominal_h = src_h;
    shm->nominal_fps = fps;

    // --- TRT ---
    IRuntime* rt=nullptr;
    ICudaEngine* eng=load_engine(args.trt, rt);
    std::unique_ptr<IExecutionContext> ctx(eng->createExecutionContext());

    int inIdx=-1, outIdx=-1;
    for (int i=0;i<eng->getNbIOTensors();++i){
        const char* name = eng->getIOTensorName(i);
        if (eng->getTensorIOMode(name)==TensorIOMode::kINPUT) inIdx=i; else outIdx=i;
    }
    const char* inName = eng->getIOTensorName(inIdx);
    const char* outName = eng->getIOTensorName(outIdx);

    if (args.dynamic){
        Dims d; d.nbDims=4; d.d[0]=1; d.d[1]=3; d.d[2]=args.img; d.d[3]=args.img;
        if (!ctx->setInputShape(inName, d)){ std::cerr<<"setInputShape failed\n"; return 1; }
    }

    Dims inD = ctx->getTensorShape(inName);
    Dims outD= ctx->getTensorShape(outName);
    size_t inCount=1, outCount=1;
    for (int i=0;i<inD.nbDims;i++) inCount *= (size_t)inD.d[i];
    for (int i=0;i<outD.nbDims;i++) outCount *= (size_t)outD.d[i];

    float *dIn=nullptr, *dOut=nullptr;
    CHECK_CUDA(cudaMalloc(&dIn, inCount * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&dOut, outCount* sizeof(float)));
    std::vector<float> hOut(outCount);
    cudaStream_t stream; CHECK_CUDA(cudaStreamCreate(&stream));

    uint64_t frame_id=0;
    while (true){
        cv::Mat frame_bgr;
        if(!cap.read(frame_bgr)) break;

        // Prepare TRT input
        LetterboxInfo lb;
        cv::Mat lbimg = letterbox(frame_bgr, args.img, lb);
        cv::Mat img_f; lbimg.convertTo(img_f, CV_32FC3, 1.0/255.0);
        std::vector<float> chw(3*args.img*args.img);
        int hw=args.img*args.img;
        std::vector<cv::Mat> chs{
            cv::Mat(args.img,args.img,CV_32FC1, chw.data()+0*hw),
            cv::Mat(args.img,args.img,CV_32FC1, chw.data()+1*hw),
            cv::Mat(args.img,args.img,CV_32FC1, chw.data()+2*hw)
        };
        cv::split(img_f, chs);

        CHECK_CUDA(cudaMemcpyAsync(dIn, chw.data(), chw.size()*sizeof(float), cudaMemcpyHostToDevice, stream));
        ctx->setTensorAddress(inName, dIn);
        ctx->setTensorAddress(outName, dOut);
        if (!ctx->enqueueV3(stream)){ std::cerr<<"enqueueV3 failed\n"; break; }
        CHECK_CUDA(cudaMemcpyAsync(hOut.data(), dOut, hOut.size()*sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK_CUDA(cudaStreamSynchronize(stream));

        // Parse detections to *original* frame coordinates
        std::vector<Det> dets; dets.reserve(512);
        parse_yolov5_output(hOut.data(), outD, args.conf, lb, src_w, src_h, dets);
        dets = nms(dets, args.nms);

        // ---- Write to SHM (with backpressure) ----
        frame_id++;
        int slot_id = (int)(frame_id % RING_SIZE);
        Slot& slot = shm->slots[slot_id];

        // If the reader is slow, wait for the slot to become free
        // (drop frames if you prefer: just skip waiting).
        while (__atomic_load_n(&slot.ready, __ATOMIC_ACQUIRE) == SLOT_READY) {
            usleep(1000); // simple sleep to avoid busy-wait burn
        }

        // Mark WRITING
        __atomic_store_n(&slot.ready, SLOT_WRITING, __ATOMIC_RELEASE);
        slot.frame_id = frame_id;
        slot.fps = fps;

        // Copy image (prefer original size if it fits)
        int out_w = frame_bgr.cols;
        int out_h = frame_bgr.rows;
        cv::Mat rgb; cv::cvtColor(frame_bgr, rgb, cv::COLOR_BGR2RGB);
        if (out_w > WIDTH_MAX || out_h > HEIGHT_MAX) {
            // Resize to fit capacity, adjust dets accordingly
            double sx = (double)WIDTH_MAX / out_w;
            double sy = (double)HEIGHT_MAX / out_h;
            cv::Mat resized;
            cv::resize(rgb, resized, cv::Size((int)(out_w*sx), (int)(out_h*sy)));
            out_w = resized.cols; out_h = resized.rows;
            // scale dets
            for (auto& d : dets){
                d.box.x *= (float)sx; d.box.y *= (float)sy;
                d.box.width *= (float)sx; d.box.height *= (float)sy;
            }
            std::memcpy(slot.img, resized.data, (size_t)out_w*out_h*3);
        } else {
            std::memcpy(slot.img, rgb.data, (size_t)out_w*out_h*3);
        }
        slot.img_w = out_w;
        slot.img_h = out_h;

        // Copy detections (clamped to MAX_DETS)
        int M = std::min((int)dets.size(), MAX_DETS);
        slot.num_dets = M;
        for (int i=0;i<M;i++){
            slot.dets[i].x1 = dets[i].box.x;
            slot.dets[i].y1 = dets[i].box.y;
            slot.dets[i].x2 = dets[i].box.x + dets[i].box.width;
            slot.dets[i].y2 = dets[i].box.y + dets[i].box.height;
            slot.dets[i].score = dets[i].score;
            slot.dets[i].class_id = dets[i].cls;
        }

        // Publish
        __atomic_thread_fence(__ATOMIC_SEQ_CST);
        __atomic_store_n(&slot.ready, SLOT_READY, __ATOMIC_RELEASE);
        __atomic_store_n(&shm->write_frame, frame_id, __ATOMIC_RELEASE);

        // (Optional debug)
        // std::cout<<"Frame "<<frame_id<<" dets="<<M<<" img="<<out_w<<"x"<<out_h<<"\n";
    }

    cudaStreamDestroy(stream);
    cudaFree(dIn); cudaFree(dOut);
    ctx.reset();   // destroy the context before the engine
    delete eng;
    delete rt;
    munmap(base, shm_size());
    close(fd);
    shm_unlink(args.shm_name.c_str());
    return 0;
}
```

Build (example):

```bash
g++ -std=c++17 detector_trt_shm.cpp -o detector_trt_shm \
  `pkg-config --cflags --libs opencv4` \
  -L/path/to/TensorRT/lib -lnvinfer -lcudart -lpthread -lrt
```

Run:

```bash
./detector_trt_shm --path input.mp4 --trt yolov5.engine --img 640 --conf 0.25 --nms 0.45 --shm /yolo_shm
```

4) Python consumer (tracker side) – tracker_shm.py

This shows a correct reader you can drop into your project. It reads frames + dets from SHM safely and returns frame (BGR), dets (Nx6 float32: x1,y1,x2,y2,score,class).

If you want to keep your full HOPTracker loop, just replace your open_shm, read_frame, and slot_to_tensor with the ones below and keep the rest of your logic.

```python
# tracker_shm.py
import os, time, mmap, argparse, ctypes as C
import numpy as np
import cv2

from shm_layout import ShmBlock, Slot, Detection, RING_SIZE, MAX_DETS, SLOT_READY, SLOT_FREE


def open_shm(name: str):
    """
    POSIX shm_open wrapper: the Python side opens the file path
    /dev/shm/<name-without-leading-slash> and mmaps the whole ShmBlock.
    """
    path = f"/dev/shm/{name.lstrip('/')}"
    fd = os.open(path, os.O_RDWR)
    size = C.sizeof(ShmBlock)
    mm = mmap.mmap(fd, size, mmap.MAP_SHARED, mmap.PROT_READ | mmap.PROT_WRITE)
    os.close(fd)
    block = ShmBlock.from_buffer(mm)
    return mm, block


def read_slot_blocking(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    """
    Waits for the specific frame_id slot to become READY.
    Returns the Slot or None on timeout.
    """
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            sid = frame_id_expected % RING_SIZE
            slot = block.slots[sid]
            if slot.ready == SLOT_READY and slot.frame_id == frame_id_expected:
                return slot
            # if the writer skipped ahead and overwrote the expected slot, jump forward
            if last - frame_id_expected >= RING_SIZE:
                frame_id_expected = last
        if (time.time() - t0) * 1000 > timeout_ms:
            return None
        time.sleep(0.0005)


def slot_to_numpy_frame(slot: Slot) -> np.ndarray:
    """
    Returns a *copy* of the BGR frame. Copying is essential so we can
    mark the slot FREE immediately.
    """
    w, h = int(slot.img_w), int(slot.img_h)
    nbytes = w * h * 3
    # slice only the used bytes before reshape:
    buf = np.frombuffer(slot.img, dtype=np.uint8, count=nbytes)
    rgb = buf.reshape((h, w, 3)).copy()  # copy to detach from SHM
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr


def slot_to_dets(slot: Slot) -> np.ndarray:
    n = int(slot.num_dets)
    if n <= 0:
        return np.zeros((0, 6), dtype=np.float32)
    arr = np.zeros((n, 6), dtype=np.float32)
    for i in range(n):
        d = slot.dets[i]
        arr[i, 0] = d.x1; arr[i, 1] = d.y1; arr[i, 2] = d.x2; arr[i, 3] = d.y2
        arr[i, 4] = d.score; arr[i, 5] = float(d.class_id)
    return arr


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--shm", type=str, default="/yolo_shm")
    ap.add_argument("--show", action="store_true")
    args = ap.parse_args()

    mm, shm = open_shm(args.shm)
    width = shm.nominal_w
    height = shm.nominal_h
    fps = shm.nominal_fps if shm.nominal_fps > 0 else 30.0
    print(f"Connected to SHM. Nominal stream {width}x{height} @ {fps:.2f}fps")

    frame_id = 1
    try:
        while True:
            slot = read_slot_blocking(shm, frame_id, timeout_ms=2000)
            if slot is None:
                print("Timeout waiting for frame", frame_id)
                # skip to newest if writer is alive
                latest = shm.write_frame
                if latest > frame_id:
                    frame_id = latest
                else:
                    break
                continue

            frame = slot_to_numpy_frame(slot)
            dets = slot_to_dets(slot)

            # mark slot FREE and advance read pointer
            slot.ready = SLOT_FREE
            shm.read_frame = frame_id

            # TODO: integrate with your tracker here:
            #  - Convert dets to torch if needed
            #  - Call HOPTracker.detect_track_fuse or hopping_update
            # For demo we just draw boxes:
            for (x1, y1, x2, y2, score, cls) in dets:
                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
                cv2.putText(frame, f"{int(cls)}:{score:.2f}", (int(x1), int(y1) - 4),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            if args.show:
                cv2.imshow("Python tracker (reader)", frame)
                if cv2.waitKey(1) & 0xFF in [27, ord('q'), ord('Q')]:
                    break

            frame_id += 1
    finally:
        try:
            cv2.destroyAllWindows()
        except Exception:
            pass
        mm.close()


if __name__ == "__main__":
    main()
```
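Run it alongside the detector (the flags are the ones defined in the argparse above):

```bash
python3 tracker_shm.py --shm /yolo_shm --show
```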

Where to plug into your HOPTracker loop

In your existing script, keep your YOLOX/HOPTracker code as-is, but replace your open_shm, read_frame, slot_to_tensor implementations with the versions above:

  • Use slot_to_numpy_frame(slot) to get frame (BGR).
  • Use slot_to_dets(slot) to get dets as Nx6 float32. If your tracker expects a torch.Tensor, just:

    ```python
    import torch
    dets_torch = torch.from_numpy(dets).float().cpu()
    ```

  • After you copy the frame, immediately do:

    ```python
    slot.ready = SLOT_FREE
    shm.read_frame = frame_id
    ```
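Putting those pieces together, one iteration of your loop would look roughly like this: a sketch assuming the read_slot_blocking/slot_to_* helpers above and the detect_track_fuse call signature from your original script.

```python
# Sketch of one loop iteration inside your HOPTracker script. Assumes the
# helpers from tracker_shm.py are imported and `tracker` is your HOPTracker.
import torch

slot = read_slot_blocking(shm, frame_id, timeout_ms=2000)
if slot is not None:
    frame = slot_to_numpy_frame(slot)   # private BGR copy, safe to keep
    dets = slot_to_dets(slot)           # Nx6 float32 [x1 y1 x2 y2 score class]
    slot.ready = SLOT_FREE              # hand the slot back to the writer
    shm.read_frame = frame_id
    if len(dets):
        dets_t = torch.from_numpy(dets).float().cpu()
        online_targets = tracker.detect_track_fuse(
            dets_t, [frame.shape[0], frame.shape[1], frame], (640, 640)
        )
```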

Why this fixes the intermittent problem

  • No more reshape errors: Python only reshapes the first img_w*img_h*3 bytes, which exactly match what the writer stored.
  • No tearing: The writer sets ready=2 only after all bytes & metadata are written; the reader copies the data and then sets ready=0. The writer also issues a memory fence before publishing, so field writes aren't reordered by the compiler/CPU.
  • Consistent coordinates: Detections are in the same coordinate system as the stored frame. If the frame is resized to fit the SHM capacity, the writer scales both the image and the boxes consistently, so overlay/tracking line up.

Notes & options

  • If you want zero copies of the image for speed, you can avoid .copy() in slot_to_numpy_frame and defer slot.ready=SLOT_FREE until you’re done using that frame. The current version copies for safety.
  • If you always want 640×640 (network space) everywhere, store the 640×640 letterboxed image in SHM and do not un-letterbox the detections. Then set slot.img_w=slot.img_h=640 and skip the scaling code. The tracker & overlays will use the net space consistently. (The above code uses original-space by default for better visual fidelity.)
  • If your camera can be >1080p, bump WIDTH_MAX/HEIGHT_MAX in both header & Python mirror (e.g., 3840×2160). Remember that memory = RING_SIZE * WIDTH_MAX * HEIGHT_MAX * 3 bytes (plus dets), so plan accordingly.
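For the defaults above, you can let ctypes do that arithmetic (a small sketch using the shm_layout.py mirror; with 1920×1080 capacity each slot is roughly 6 MiB, so 8 slots come to about 48 MiB):

```python
# shm_budget.py - compute the mapping size before bumping WIDTH_MAX/HEIGHT_MAX.
import ctypes
from shm_layout import ShmBlock, Slot, RING_SIZE, WIDTH_MAX, HEIGHT_MAX

per_slot = ctypes.sizeof(Slot)    # image capacity + dets array + per-slot metadata
total = ctypes.sizeof(ShmBlock)   # header + RING_SIZE slots
print(f"per slot: {per_slot / 2**20:.1f} MiB ({WIDTH_MAX}x{HEIGHT_MAX}x3 image)")
print(f"total   : {total / 2**20:.1f} MiB for RING_SIZE={RING_SIZE}")
```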

If you want, paste your current track_utils_shm.py + HOPTracker callsites and I’ll drop these functions into them for you so it’s one cohesive script.
