i have c++ detection send frame and detection wit...

Created on: October 15, 2025

Answered using GPT-5 Thinking by Chat01

Question

I have a C++ detector that sends frames and detections through shared memory to a Python tracker, but sometimes I have a problem.

Write complete code for both sides
and explain clearly

```python
import sys, os, time, math, argparse, mmap, ctypes
import numpy as np
import cv2
from loguru import logger
import torch

# Add the parent directory of myhop to PYTHONPATH
sys.path.append("/media/rayfan/d/projects/HOP_system/myhop")

# yolox / tracker
from yolox.data.datasets import COCO_CLASSES
from yolox.utils.visualize import plot_tracking
from yolox.tracker.hop_tracker import HOPTracker, pixel_distribution

# From the previous code: helpers for the dynamic detection rate
from track_utils_shm import dbscan_clustering, detection_rate_adjuster

# shared memory layout
from shm_layout import ShmBlock, RING_SIZE, MAX_DETS


def open_shm(shm_name: str):
    # POSIX shm_open creates a file under /dev/shm: /dev/shm/<name_without_slash>
    path = f"/dev/shm/{shm_name.lstrip('/')}"
    fd = os.open(path, os.O_RDWR)
    size = ctypes.sizeof(ShmBlock)
    mm = mmap.mmap(fd, size, mmap.MAP_SHARED, mmap.PROT_READ | mmap.PROT_WRITE)
    os.close(fd)
    block = ShmBlock.from_buffer(mm)
    return mm, block


def read_slot(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            # if slot.ready == 1 and slot.frame_id == frame_id_expected:
            #     print("im in **********************************")
            #     return slot
            return slot
        if (time.time() - t0) * 1000 > timeout_ms:
            return None
        time.sleep(0.001)


def read_slot(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    t0 = time.time()
    # frame_id_expected = 0
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            return slot  # only return when the slot is ready
            # else:
            #     continue
        if (time.time() - t0) * 1000 > timeout_ms:
            return None
        time.sleep(0.001)


def read_frame(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            if slot.ready == 1 and slot.frame_id == frame_id_expected:
                frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((slot.img_h, slot.img_w, 3))
                slot.ready = 0  # tell C++ it's safe to reuse
                block.read_frame = frame_id_expected
                return frame, slot
        if (time.time() - t0) * 1000 > timeout_ms:
            return None, None
        time.sleep(0.001)


def slot_to_tensor(slot):
    n = int(slot.num_dets)
    if n <= 0:
        return None
    arr = np.zeros((n, 6), dtype=np.float32)
    for i in range(n):
        d = slot.dets[i]
        arr[i, 0] = d.x1
        arr[i, 1] = d.y1
        arr[i, 2] = d.x2
        arr[i, 3] = d.y2
        arr[i, 4] = d.score
        arr[i, 5] = float(d.class_id)
    return arr


def read_frame(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    """
    Assumption: each slot holds an RGB uint8 frame of shape (H, W, 3),
    stored in block.slots[slot_id].frame
    """
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            slot_id = frame_id_expected % RING_SIZE
            slot = block.slots[slot_id]
            # frame = np.frombuffer(slot.frame, dtype=np.uint8)
            # frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((HEIGHT_MAX, WIDTH_MAX, 3))
            frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((slot.img_h, slot.img_w, 3))
            # frame = frame.reshape((block.height, block.width, 3))
            # frame = frame.reshape((block.img_h, block.img_w, 3))
            return frame, slot
        if (time.time() - t0) * 1000 > timeout_ms:
            return None, None
        time.sleep(0.001)


from ctypes import Structure, c_uint64, c_int32, c_float, c_uint8

# Constants
WIDTH_MAX = 640
HEIGHT_MAX = 640
MAX_DETS = 1024
RING_SIZE = 8

# -----------------------
# Equivalent of struct Detection
class Detection(Structure):
    _pack_ = 1
    _fields_ = [
        ("x1", c_float),
        ("y1", c_float),
        ("x2", c_float),
        ("y2", c_float),
        ("score", c_float),
        ("class_id", c_int32)
    ]

# -----------------------
# Equivalent of struct Slot
class Slot(Structure):
    _pack_ = 1
    _fields_ = [
        ("frame_id", c_uint64),   # frame number
        ("ready", c_int32),       # 0: empty, 1: ready to read
        ("num_dets", c_int32),    # number of valid detections
        ("img_w", c_int32),
        ("img_h", c_int32),
        ("fps", c_float),
        ("dets", Detection * MAX_DETS),
        ("img", c_uint8 * (HEIGHT_MAX * WIDTH_MAX * 3))  # RGB
    ]

# -----------------------
# Equivalent of struct ShmBlock
class ShmBlock(Structure):
    _pack_ = 1
    _fields_ = [
        ("write_frame", c_uint64),  # last written frame
        ("read_frame", c_uint64),   # last read frame
        ("ring_size", c_int32),     # = RING_SIZE
        ("max_dets", c_int32),      # = MAX_DETS
        ("img_w", c_int32),
        ("img_h", c_int32),
        ("fps", c_float),
        ("slots", Slot * RING_SIZE)
    ]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', "--source", type=str, default="video", help="video or webcam")
    # parser.add_argument('-p', "--path", type=str, required=True, help="input video")
    parser.add_argument('-p', '--path', default=None, help='path to video file')  # <- default=None makes it optional
    parser.add_argument("-m", "--model", type=str, default="yolov5", help="kept for compatibility")
    parser.add_argument("--save_result", action="store_true")
    parser.add_argument("--dynamic", dest="dynamic", action="store_true")
    parser.add_argument("--swift", dest="swift", action="store_true")
    parser.add_argument("--accurate", dest="accurate", action="store_true", default=True)
    # added by me this line
    parser.add_argument("--dis_traj", action="store_true", help="display trajectories")

    # tracking args (same as before)
    parser.add_argument("--track_thresh", type=float, default=0.3)
    parser.add_argument("--track_buffer", type=int, default=200)  # 60
    parser.add_argument("--match_thresh", type=float, default=0.2)
    parser.add_argument("--aspect_ratio_thresh", type=float, default=0.25)
    parser.add_argument('--min_box_area', type=float, default=1)
    parser.add_argument("--shm", type=str, default="/yolo_shm", help="shared memory name")
    args = parser.parse_args()

    shm_mm, shm = open_shm(args.shm)

    # # video for display/saving
    # cap = cv2.VideoCapture(args.path)
    # if not cap.isOpened():
    #     print("Failed to open video")
    #     return
    # width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    # height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # fps = cap.get(cv2.CAP_PROP_FPS) or shm.fps or 30.0
    width = shm.img_w
    height = shm.img_h
    # width = 640
    # height = 640
    fps = shm.fps

    if args.save_result:
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter("tracked.mp4", fourcc, fps, (width, height))
    else:
        out = None

    # choose the sampling strategy (as before)
    if args.swift:
        sampling_strategy = 1
    elif args.accurate:
        sampling_strategy = 2
    else:
        sampling_strategy = 0
    detection_rate = 9  # initial value

    if args.path is None:
        args.path = "no_video"

    tracker = HOPTracker(args, frame_rate=math.ceil(fps))
    light_multi_tracker = cv2.legacy.MultiTracker_create()
    frame_id = 1
    predicted_bbox = []
    online_targets = []
    light_tracker_id = []

    # main loop
    while True:
        # ok, frame = cap.read()
        # if not ok: break

        # slot_id = frame_id % RING_SIZE
        # slot = shm.slots[slot_id]
        # frame = np.frombuffer(slot.img, dtype=np.uint8).reshape((HEIGHT_MAX, WIDTH_MAX, 3))

        # slot, frame = read_frame(shm, frame_id)
        # if frame is None:
        #     time.sleep(0.002)
        #     continue

        # frame, slot = read_frame(shm, frame_id)
        # if frame is None or slot is None:
        #     time.sleep(0.002)
        #     continue

        # read frame and slot in the correct order (read_frame -> returns (frame, slot))
        frame, slot = read_frame(shm, frame_id)
        # if frame is None or slot is None:
        if frame is None:
            print("frame is none")
            time.sleep(0.002)
            continue
        if slot is None:
            print("slot is none")
            time.sleep(0.002)
            continue

        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # # extra guard (in case frame somehow became the wrong object)
        # if not isinstance(frame, np.ndarray):
        #     logger.error(f"Expected numpy.ndarray for frame but got {type(frame)} — skipping frame {frame_id}")
        #     time.sleep(0.002)
        #     continue

        # # each frame, read its detections from shm (C++ writes every frame)
        # slot = read_slot(shm, frame_id)
        # if slot is None:
        #     # if C++ has fallen behind, skip the frame or keep waiting
        #     time.sleep(0.002)
        #     continue

        # # define an empty detection for testing
        # slot = type('Slot', (), {})()  # build an empty object
        # slot.num_dets = 0
        # slot.dets = []

        # Before tracking
        start_time = time.time()

        # to stay aligned with the previous structure: only use detections every N frames
        if frame_id % 4 == 1:  # (previously frame_id % detection_rate == 1)
            light_multi_tracker.clear()
            light_multi_tracker = cv2.legacy.MultiTracker_create()

            # dets = slot_to_tensor(slot)  # Nx6 [x1 y1 x2 y2 score class]
            # if dets is None:
            #     online_im = frame
            # else:
            #     # detect + track fuse (as before)
            #     online_targets = tracker.detect_track_fuse(
            #         dets, [height, width, frame], (640, 640)
            #     )

            dets = slot_to_tensor(slot)  # Nx6 [x1 y1 x2 y2 score class]
            if dets is None:
                print("dets is None")
                online_im = frame
            else:
                # convert numpy.ndarray to a torch.Tensor on CPU
                dets = torch.from_numpy(dets).float().cpu()
                print("/////////////////////////////////////////////////////////////////")
                print("dets = ", dets)
                for row in dets:
                    print([f"{v:.2f}" for v in row])
                print("height = ", height)
                print("width = ", width)
                print("frame = ", frame)

                online_targets = tracker.detect_track_fuse(
                    dets, [height, width, frame], (640, 640)
                )

                online_tlwhs, online_ids, online_scores, online_class_id = [], [], [], []
                predicted_bbox = []
                light_tracker_id = []
                for t in online_targets:
                    print("online_targets=", online_targets)
                    t.last_detected_frame = frame_id
                    tlwh = t.tlwh; tid = t.track_id
                    tlbr = np.asarray(tlwh).copy(); tlbr[2:] += tlbr[:2]
                    tlbr = np.append(tlbr, [t.score, t.class_id, tid, *t.mean])
                    predicted_bbox.append(tlbr)
                    if tlwh[2]*tlwh[3] > args.min_box_area and tlwh[3]/tlwh[2] >= args.aspect_ratio_thresh:
                        hist_b, hist_g, hist_r = pixel_distribution(
                            frame, int(t.tlwh[0]), int(t.tlwh[1]), int(t.tlwh[2]), int(t.tlwh[3])
                        )
                        t.color_dist = [hist_b, hist_g, hist_r]
                        # light tracker on static boxes
                        if t.mean[4] == 0 and t.mean[5] == 0 and t.mean[6] == 0 and t.mean[7] == 0:
                            light_multi_tracker.add(cv2.legacy.TrackerMedianFlow_create(), frame,
                                                    (int(t.tlwh[0]), int(t.tlwh[1]), int(t.tlwh[2]), int(t.tlwh[3])))
                            light_tracker_id.append(tid)
                        online_tlwhs.append(tlwh)
                        online_ids.append(tid)
                        online_scores.append(t.score)
                        online_class_id.append(t.class_id)

                # render
                online_im = plot_tracking(
                    image=frame, tlwhs=online_tlwhs, obj_ids=online_ids,
                    online_class_id=online_class_id, frame_id=frame_id, fps=fps,
                    scores=online_scores, class_names=COCO_CLASSES
                )
        else:
            # intermediate frames: only Kalman/light-tracker update (as in the previous structure)
            if len(predicted_bbox) != 0 and len(online_targets) != 0:
                if frame_id % max(4, detection_rate) <= 3:
                    light_ok, light_bbox = light_multi_tracker.update(frame)
                else:
                    light_ok, light_bbox = False, []
                predict_bbox = []
                for idx, each_track in enumerate(online_targets):
                    if light_ok and each_track.track_id in light_tracker_id and len(light_bbox) != 0:
                        lid = light_tracker_id.index(each_track.track_id)
                        nb = light_bbox[lid]
                        predict_bbox.append([nb[0], nb[1], nb[0]+nb[2], nb[1]+nb[3],
                                             predicted_bbox[idx][4], predicted_bbox[idx][5]])
                    else:
                        new_x = each_track.mean[0] + each_track.mean[4]
                        new_y = each_track.mean[1] + each_track.mean[5]
                        new_a = each_track.mean[2] + each_track.mean[6]
                        new_h = each_track.mean[3] + each_track.mean[7]
                        new_w = new_a * new_h
                        tlwh = [new_x - new_w/2, new_y - new_h/2, new_w, new_h]
                        tlbr = np.asarray(tlwh).copy(); tlbr[2:] += tlbr[:2]
                        tlbr = np.append(tlbr, [predicted_bbox[idx][4], predicted_bbox[idx][5]])
                        predict_bbox.append(tlbr)
                predicted_bbox = np.array(predict_bbox, dtype=np.float32)
                online_targets = tracker.hopping_update(
                    predicted_bbox, [height, width, frame], (640, 640)
                )
                online_tlwhs, online_ids, online_scores, online_class_id = [], [], [], []
                predicted_bbox = []
                for t in online_targets:
                    tlwh = t.tlwh; tid = t.track_id
                    tlbr = np.asarray(tlwh).copy(); tlbr[2:] += tlbr[:2]
                    tlbr = np.append(tlbr, [t.score, t.class_id, tid, *t.mean])
                    predicted_bbox.append(tlbr)
                    if tlwh[2]*tlwh[3] > args.min_box_area and tlwh[3]/tlwh[2] >= args.aspect_ratio_thresh:
                        online_tlwhs.append(tlwh); online_ids.append(tid)
                        online_scores.append(t.score); online_class_id.append(t.class_id)
                online_im = plot_tracking(
                    image=frame, tlwhs=online_tlwhs, obj_ids=online_ids,
                    online_class_id=online_class_id, frame_id=frame_id, fps=fps,
                    scores=online_scores, class_names=COCO_CLASSES
                )
            else:
                online_im = frame

        # Add this right after tracking
        end_time = time.time()
        tracker_time_ms = (end_time - start_time) * 1000
        print(f"Frame {frame_id}: tracker time = {tracker_time_ms:.2f} ms")

        # dynamic sampling (optional, as before)
        if (frame_id % 2) == 0 and len(online_targets) != 0:  # 5
            cluster_dic, cluster_num = dbscan_clustering(online_targets)
            detection_rate = detection_rate_adjuster(cluster_dic, cluster_num,
                                                     os.path.basename(args.path), sampling_strategy)

        # output/display
        if out is not None:
            out.write(online_im)
        else:
            cv2.imshow("HOPTrack (Python Tracker + C++ TRT Detector)", online_im)
            if cv2.waitKey(1) & 0xFF in [27, ord('q'), ord('Q')]:
                break

        frame_id += 1

    if out is not None:
        out.release()
    # cap.release()
    cv2.destroyAllWindows()
    shm_mm.close()


if __name__ == "__main__":
    main()
```

Do you want me to send you the C++ side?

```cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstring>
#include <algorithm>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime.h"
#include "shm_layout.h"
#include <numeric> // for std::iota

using namespace nvinfer1;

class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // suppress info-level messages
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} logger;

// ---------- CUDA error-handling helper ----------
#define CHECK_CUDA(expr) do { cudaError_t __e = (expr); if (__e != cudaSuccess) { \
    std::cerr << "CUDA Error: " << cudaGetErrorString(__e) << " @ " << __FILE__ << ":" << __LINE__ << "\n"; exit(1);} } while(0)

struct Args {
    std::string path;
    std::string trt;
    std::string shm_name = "/yolo_shm";
    int img = 640;
    float conf = 0.25f;
    float nms = 0.45f;
    bool dynamic = false;
};

static void parse_args(int argc, char** argv, Args& a) {
    for (int i=1; i<argc; ++i) {
        std::string k = argv[i];
        auto next = [&]() { if(i+1>=argc) {std::cerr<<"Missing value for "<<k<<"\n"; exit(1);} return std::string(argv[++i]); };
        if (k=="--path" || k=="-p") a.path = next();
        else if (k=="--trt" || k=="-trt") a.trt = next();
        else if (k=="--img") a.img = std::stoi(next());
        else if (k=="--conf") a.conf = std::stof(next());
        else if (k=="--nms") a.nms = std::stof(next());
        else if (k=="--dynamic" || k=="--dyn" || k=="--dynamic-shape") a.dynamic = true;
        else if (k=="--shm") a.shm_name = next();
        else { std::cerr<<"Unknown arg: "<<k<<"\n"; exit(1); }
    }
    if (a.path.empty() || a.trt.empty()) {
        std::cerr<<"Usage: detector_trt_shm --path <video> --trt <engine.trt> [--img 640] [--conf 0.25] [--nms 0.45] [--dynamic] [--shm /yolo_shm]\n";
        exit(1);
    }
}

// ---------- letterbox helper ----------
struct LetterboxInfo { float scale; int pad_w; int pad_h; int net; };
static cv::Mat letterbox(const cv::Mat& img, int net_size, LetterboxInfo& info) {
    int w = img.cols, h = img.rows; info.net = net_size;
    float r = std::min((float)net_size / h, (float)net_size / w);
    int new_unpad_w = std::round(w * r);
    int new_unpad_h = std::round(h * r);
    cv::Mat resized; cv::resize(img, resized, cv::Size(new_unpad_w, new_unpad_h), 0, 0, cv::INTER_LINEAR);
    int dw = net_size - new_unpad_w;
    int dh = net_size - new_unpad_h;
    info.scale = r;
    info.pad_w = dw/2;
    info.pad_h = dh/2;
    cv::Mat out(net_size, net_size, CV_8UC3, cv::Scalar(114,114,114));
    resized.copyTo(out(cv::Rect(info.pad_w, info.pad_h, resized.cols, resized.rows)));
    return out;
}

static inline float iou(const cv::Rect2f& a, const cv::Rect2f& b) {
    float inter = (a & b).area();
    float uni = a.area() + b.area() - inter + 1e-6f;
    return inter / uni;
}

struct Det { cv::Rect2f box; float score; int cls; };

// simple NMS
static std::vector<Det> nms(const std::vector<Det>& dets, float iou_thr) {
    std::vector<int> idxs(dets.size());
    std::iota(idxs.begin(), idxs.end(), 0);
    std::sort(idxs.begin(), idxs.end(), [&](int i, int j){ return dets[i].score > dets[j].score; });
    std::vector<Det> keep;
    std::vector<char> removed(dets.size(), 0);
    for (size_t _i=0; _i<idxs.size(); ++_i) {
        int i = idxs[_i]; if (removed[i]) continue;
        keep.push_back(dets[i]);
        for (size_t _j=_i+1; _j<idxs.size(); ++_j) {
            int j = idxs[_j]; if (removed[j]) continue;
            if (iou(dets[i].box, dets[j].box) > iou_thr) removed[j] = 1;
        }
    }
    return keep;
}

// deserialize the TRT engine
static ICudaEngine* load_engine(const std::string& path, IRuntime*& runtime) {
    std::ifstream f(path, std::ios::binary);
    if(!f) { std::cerr<<"Cannot open engine: "<<path<<"\n"; exit(1); }
    f.seekg(0, std::ios::end); size_t sz = f.tellg(); f.seekg(0, std::ios::beg);
    std::vector<char> buf(sz); f.read(buf.data(), sz);

    runtime = nvinfer1::createInferRuntime(logger);
    if (!runtime) { std::cerr<<"Failed to create runtime\n"; exit(1); }
    auto* eng = runtime->deserializeCudaEngine(buf.data(), sz);
    if (!eng) { std::cerr<<"Failed to deserialize engine\n"; exit(1); }
    return eng;
}

// helper: find input/output binding index
static int find_binding_index(ICudaEngine* engine, bool input=true) {
    for (int i=0;i<engine->getNbIOTensors();++i) {
        const char* name = engine->getIOTensorName(i);
        TensorIOMode mode = engine->getTensorIOMode(name);
        if ( (input && mode==TensorIOMode::kINPUT) || (!input && mode==TensorIOMode::kOUTPUT) ) {
            return i;
        }
    }
    return -1;
}

// parse the common YOLOv5 output layouts (two well-known cases)
static void parse_yolov5_output(const float* out, const Dims& dims, float conf_thr, int net,
                                const LetterboxInfo& lb, int img_w, int img_h,
                                std::vector<Det>& out_dets)
{
    // case A: (1, N, 85) or (N, 85)
    int nb = 1, N = 0, C = 0;
    if (dims.nbDims==3) { nb = dims.d[0]; N = dims.d[1]; C = dims.d[2]; }
    else if (dims.nbDims==2) { N = dims.d[0]; C = dims.d[1]; }
    else { // case B: (1, 25200, 7) or (M, 6)
        // fall through to the generic handling below
    }
    std::cout << "Output dims: nbDims=" << dims.nbDims;
    for (int i=0;i<dims.nbDims;i++) std::cout << " " << dims.d[i];
    std::cout << std::endl;

    if (C >= 6 && N > 0) {
        for (int i=0;i<N;i++) {
            const float* p = out + i*C;
            float x = p[0], y = p[1], w = p[2], h = p[3];
            float obj = p[4];
            // pick the best class
            int best_cls = -1; float best_conf = 0.f;
            for (int k=5;k<C;k++) { float c = p[k]; if (c > best_conf) { best_conf = c; best_cls = k-5; } }
            float score = obj * best_conf;
            if (score < conf_thr) continue;
            // xywh → x1y1x2y2 in network space
            float x1 = x - w/2.f;
            float y1 = y - h/2.f;
            float x2 = x + w/2.f;
            float y2 = y + h/2.f;
            // map back to the original image (remove padding, undo scaling)
            x1 -= lb.pad_w; x2 -= lb.pad_w;
            y1 -= lb.pad_h; y2 -= lb.pad_h;
            x1 /= lb.scale; x2 /= lb.scale;
            y1 /= lb.scale; y2 /= lb.scale;
            // clamp to image bounds
            x1 = std::max(0.f, std::min((float)img_w-1.f, x1));
            y1 = std::max(0.f, std::min((float)img_h-1.f, y1));
            x2 = std::max(0.f, std::min((float)img_w-1.f, x2));
            y2 = std::max(0.f, std::min((float)img_h-1.f, y2));
            out_dets.push_back({ cv::Rect2f(cv::Point2f(x1,y1), cv::Point2f(x2,y2)), score, best_cls });
        }
        return;
    }
    // alternative case: the output is already [x1,y1,x2,y2,score,cls]
    if (C==6 || C==7) {
        for (int i=0;i<N;i++) {
            const float* p = out + i*C;
            float x1=p[0], y1=p[1], x2=p[2], y2=p[3], score=p[4];
            int cls = (int)std::round(p[5]);
            if (score < conf_thr) continue;
            // inverse scaling (probably unnecessary if decoding happened inside TRT)
            out_dets.push_back({ cv::Rect2f(cv::Point2f(x1,y1), cv::Point2f(x2,y2)), score, cls });
        }
        return;
    }
    // neither matched: for simplicity, treat the output as unrecognized.
    std::cerr << "[WARN] Unrecognized output shape; please adapt parser.\n";
}

static size_t shm_size() { return sizeof(ShmBlock); }

int main(int argc, char** argv) {
    Args args; parse_args(argc, argv, args);

    // --- Shared Memory ---
    int fd = shm_open(args.shm_name.c_str(), O_CREAT | O_RDWR, 0666);
    if (fd < 0) { perror("shm_open"); return 1; }
    if (ftruncate(fd, shm_size()) < 0) { perror("ftruncate"); return 1; }
    void* base = mmap(nullptr, shm_size(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (base == MAP_FAILED) { perror("mmap"); return 1; }
    auto* shm = reinterpret_cast<ShmBlock*>(base);
    std::memset(shm, 0, sizeof(ShmBlock));
    shm->ring_size = RING_SIZE;
    shm->max_dets = MAX_DETS;

    // --- OpenCV video ---
    cv::VideoCapture cap(args.path);
    if (!cap.isOpened()) { std::cerr<<"Cannot open video: "<<args.path<<"\n"; return 1; }
    int img_w = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
    int img_h = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    float fps = (float)cap.get(cv::CAP_PROP_FPS);
    shm->img_w = img_w;
    shm->img_h = img_h;
    shm->fps = fps;

    // --- TensorRT ---
    IRuntime* runtime=nullptr;
    ICudaEngine* engine = load_engine(args.trt, runtime);
    // success message
    std::cout << "TensorRT engine loaded successfully from " << args.trt << "\n";
    IExecutionContext* ctx = engine->createExecutionContext();
    if (!ctx) { std::cerr<<"Failed to create TRT context\n"; return 1; }

    // find IO names/indices
    int in_idx = find_binding_index(engine, true);
    int out_idx = find_binding_index(engine, false);
    const char* in_name = engine->getIOTensorName(in_idx);
    const char* out_name = engine->getIOTensorName(out_idx);

    // set the input shape for dynamic engines
    if (args.dynamic) {
        Dims inDims; inDims.nbDims = 4;
        inDims.d[0]=1; inDims.d[1]=3; inDims.d[2]=args.img; inDims.d[3]=args.img;
        if (!ctx->setInputShape(in_name, inDims)) { std::cerr<<"setInputShape failed\n"; return 1; }
    }

    // buffer sizes
    Dims inDims = ctx->getTensorShape(in_name);
    Dims outDims= ctx->getTensorShape(out_name);
    size_t in_size = 1;  for (int i=0;i<inDims.nbDims;i++) in_size *= inDims.d[i];
    size_t out_size = 1; for (int i=0;i<outDims.nbDims;i++) out_size *= outDims.d[i];

    // GPU/CPU buffers
    float* d_in=nullptr; float* d_out=nullptr;
    CHECK_CUDA(cudaMalloc(&d_in, in_size * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&d_out, out_size * sizeof(float)));
    std::vector<float> h_out(out_size);

    // stream
    cudaStream_t stream;
    CHECK_CUDA(cudaStreamCreate(&stream));

    // main loop
    uint64_t frame_id = 0;
    while (true) {
        cv::Mat frame;
        if (!cap.read(frame)) break;

        LetterboxInfo lb;
        cv::Mat lbimg = letterbox(frame, args.img, lb);

        // BGR->RGB, HWC->CHW, float32/255
        cv::Mat img_f; lbimg.convertTo(img_f, CV_32FC3, 1.0/255.0);
        std::vector<float> chw(3 * args.img * args.img);
        int hw = args.img * args.img;
        std::vector<cv::Mat> chs(3);
        for (int i=0;i<3;i++) chs[i] = cv::Mat(args.img, args.img, CV_32FC1, chw.data()+i*hw);
        cv::split(img_f, chs);

        // copy to GPU
        CHECK_CUDA(cudaMemcpyAsync(d_in, chw.data(), chw.size()*sizeof(float), cudaMemcpyHostToDevice, stream));

        // bind
        ctx->setTensorAddress(in_name, d_in);
        ctx->setTensorAddress(out_name, d_out);

        // run
        if (!ctx->enqueueV3(stream)) { std::cerr<<"enqueueV3 failed\n"; break; }

        // output to CPU
        CHECK_CUDA(cudaMemcpyAsync(h_out.data(), d_out, h_out.size()*sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK_CUDA(cudaStreamSynchronize(stream));

        // parse output
        std::vector<Det> dets;
        parse_yolov5_output(h_out.data(), outDims, args.conf, args.img, lb, img_w, img_h, dets);

        // NMS
        dets = nms(dets, args.nms);

        /*
        // 🔹 add drawing/display code here
        for (const auto& d : dets) {
            cv::rectangle(frame, d.box, cv::Scalar(0, 255, 0), 2);
            std::string label = "cls:" + std::to_string(d.cls) + " conf:" + cv::format("%.2f", d.score);
            int baseLine;
            cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
            cv::rectangle(frame, cv::Point((int)d.box.x, (int)d.box.y - labelSize.height - 2),
                          cv::Point((int)d.box.x + labelSize.width, (int)d.box.y),
                          cv::Scalar(0, 255, 0), cv::FILLED);
            cv::putText(frame, label, cv::Point((int)d.box.x, (int)d.box.y - 2),
                        cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0,0,0), 1);
        }
        */
        cv::imshow("Detections", frame);
        if (cv::waitKey(1) == 27) break; // ESC to quit

        // write to shared memory
        frame_id++;
        int slot_id = frame_id % RING_SIZE;
        Slot& slot = shm->slots[slot_id];
        slot.ready = 0; // simple lock
        slot.frame_id = frame_id;
        slot.img_w = img_w;
        slot.img_h = img_h;
        slot.fps = fps;
        int M = std::min((int)dets.size(), MAX_DETS);
        slot.num_dets = M;
        for (int i=0;i<M;i++) {
            slot.dets[i].x1 = dets[i].box.x;
            slot.dets[i].y1 = dets[i].box.y;
            slot.dets[i].x2 = dets[i].box.x + dets[i].box.width;
            slot.dets[i].y2 = dets[i].box.y + dets[i].box.height;
            slot.dets[i].score = dets[i].score;
            slot.dets[i].class_id = dets[i].cls;
        }

        // 🔹 add this before slot.ready = 1
        cv::Mat rgb; cv::cvtColor(frame, rgb, cv::COLOR_BGR2RGB);
        cv::Mat resized; cv::resize(rgb, resized, cv::Size(WIDTH_MAX, HEIGHT_MAX));
        std::memcpy(slot.img, resized.data, WIDTH_MAX * HEIGHT_MAX * 3);
        //cv::imshow("DebugFrame", resized);
        //cv::waitKey(1);
        // or save temporarily

        slot.ready = 1;
        shm->write_frame = frame_id;

        // print detection results
        std::cout << "Frame " << frame_id << " - Detections: " << M << std::endl;
        for (int i=0;i<M;i++) {
            std::cout << "  [" << i << "] "
                      << "cls=" << slot.dets[i].class_id
                      << " conf=" << slot.dets[i].score
                      << " box=(" << slot.dets[i].x1 << "," << slot.dets[i].y1 << ")-("
                      << slot.dets[i].x2 << "," << slot.dets[i].y2 << ")" << std::endl;
        }
    }

    // cleanup
    cudaStreamDestroy(stream);
    cudaFree(d_in);
    cudaFree(d_out);
    // free TensorRT objects - new style
    delete ctx;
    delete engine;
    delete runtime;
    munmap(base, shm_size());
    close(fd);
    shm_unlink(args.shm_name.c_str());
    return 0;
}
```

Thought

Writing complete code for both sides

Given the user's request, I’ll write the working code for both the C++ producer (detector) and Python consumer (tracker). I'll focus on shared memory structure, a ring buffer, and proper handling of image size/mismatch. The C++ side will fill the buffer with resized images, and Python will reshape based on the provided video size.

Fixing buffer and synchronization issues

The problem arises from mismatched image sizes. The C++ code stores the resized image as 640x640, while Python is using the original video size for reshaping, which leads to incorrect shapes. Additionally, synchronization is causing data races since the ready flags and frame_id aren't being correctly handled or cleared. Using memory fences and properly managing ready flags should resolve the issue.

Addressing memory and consistency issues

There are contradictions in how memory layout is handled. Python redefines structs that are already imported, which should be unified. Consistency between RING_SIZE and MAX_DETS across both sides needs to be maintained. Additionally, while Python uses a different method for shared memory, the C++ side's use of shm_unlink and other structures should be reviewed for compatibility. Lastly, Python's conversion to torch seems necessary as it processes the detection boxes.

Addressing issues with data race and reshape

The problem seems to be that Python isn't properly waiting for data to be ready. The read_frame function should check ready==1 and ensure the frame_id matches, but it's not doing so. This could lead to reading partially written or outdated data. Also, the reshape is mismatched because C++ stores a 640x640 frame, but Python expects different dimensions. To fix this, defining a shared memory layout header and ensuring Python mirrors the constants would help.

Proposing a solution for frame storage and reshaping

I’m thinking that the core issue is the mismatch between the buffer size and how Python is reshaping the frame. C++ writes the frame at a fixed size, but Python is expecting a specific shape. The best approach seems to be adjusting the shared memory size or altering how Python handles the reshaping by slicing the buffer correctly. This way, Python can adjust to the actual storage size, and we avoid the reshape error.

Designing shared memory handshake and frame storage

To fix issues related to frame reshaping and missed frames, we'll ensure Python correctly handles the shared memory slots. We’ll store the original frame size in the shared memory, with boxes mapped appropriately for accuracy. The C++ and Python code will handle the frame reading/writing handshake via atomic flags to ensure synchronization. I’ll also explain how to implement these fixes in a minimal, clear setup while correcting race conditions and using shared memory helpers. The code will align with the user’s existing tracker but be more robust.

Designing shared memory structure for frame storage

For frame storage, we’ll make adjustments to ensure consistency between C++ and Python. We’ll store a resized frame (640x640) while maintaining aspect ratio, and keep detection boxes in letterbox space. This ensures the tracker reads frames in consistent coordinates. The frame structure will be updated to carry necessary details, and Python will handle frame copying and synchronization. We'll clarify the process, ensuring the C++ side adapts while addressing coordinate mismatch for tracking accuracy.

Designing shared memory integration

To ensure consistency and prevent errors in frame handling, we'll write frames without resizing unless the original frame exceeds the buffer capacity, in which case the writer resizes to fit and updates the per-slot dimensions.


Answer

You’re running into two classic shared-memory pitfalls:

  1. Shape mismatch: C++ writes a fixed WIDTH_MAX×HEIGHT_MAX image (640×640 in your snippet) but sets slot.img_w/h to the original video size (e.g. 1920×1080). Your Python then tries to reshape HEIGHT_MAX*WIDTH_MAX*3 bytes into (img_h,img_w,3) → wrong size & occasional crashes/garbage.

  2. Race conditions: The Python reader ignores slot.ready and slot.frame_id, returns a slot that may still be mid-write, and never sets slot.ready=0 after reading. That leads to tearing / inconsistent detections.
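For concreteness, pitfall 1 reproduced in isolation, as a standalone numpy snippet (the sizes are assumptions matching the snippets above: a 640×640 stored image read back at 1920×1080):

```python
import numpy as np

buf = np.zeros(640 * 640 * 3, dtype=np.uint8)  # what the writer actually stored (640x640 RGB)
buf.reshape((1080, 1920, 3))                   # what the reader asks for at 1920x1080
# ValueError: cannot reshape array of size 1228800 into shape (1080,1920,3)
```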

Below is a robust, complete pair (C++ producer + Python consumer) plus a shared layout that fixes both problems:

  • A ring buffer with ready states (0=FREE, 1=WRITING, 2=READY).
  • The writer fills all fields, does a memory fence, then flips ready=2.
  • The reader waits for ready=2 && frame_id match, copies only the used bytes, then sets ready=0.
  • The image stored in SHM is the original frame size whenever it fits; if it doesn’t fit, the writer resizes to fit into the buffer and updates img_w/img_h accordingly so Python always reshapes the correct number of bytes.
  • Python slices the buffer to img_w*img_h*3 before reshape and copies to avoid reuse by the writer while tracking.

1) Shared layout (C++ header) – shm_layout.h

```cpp
#pragma once
#include <cstdint>

#define WIDTH_MAX  1920   // capacity of the image buffer in SHM
#define HEIGHT_MAX 1080
#define MAX_DETS   1024
#define RING_SIZE  8

// Slot.ready states
enum : int32_t { SLOT_FREE = 0, SLOT_WRITING = 1, SLOT_READY = 2 };

#pragma pack(push, 1)
struct Detection {
    float   x1, y1, x2, y2;  // in the coordinate system of the *stored image*
    float   score;
    int32_t class_id;
};

struct Slot {
    uint64_t  frame_id;   // monotonically increasing
    int32_t   ready;      // 0=FREE,1=WRITING,2=READY
    int32_t   num_dets;
    int32_t   img_w;      // width of stored RGB image
    int32_t   img_h;      // height of stored RGB image
    float     fps;
    Detection dets[MAX_DETS];  // detections for this frame
    // RGB image bytes. We only use the first (img_w*img_h*3) bytes each frame.
    uint8_t   img[WIDTH_MAX * HEIGHT_MAX * 3];
};

struct ShmBlock {
    uint64_t write_frame;  // last frame_id written (for reader progress)
    uint64_t read_frame;   // last frame_id fully read (for writer stats)
    int32_t  ring_size;    // = RING_SIZE
    int32_t  max_dets;     // = MAX_DETS
    // nominal camera/stream info (may match or exceed per-slot img_w/h)
    int32_t  nominal_w;
    int32_t  nominal_h;
    float    nominal_fps;
    Slot     slots[RING_SIZE];
};
#pragma pack(pop)
```

2) Python ctypes mirror – shm_layout.py

```python
# shm_layout.py
import ctypes as C

WIDTH_MAX  = 1920
HEIGHT_MAX = 1080
MAX_DETS   = 1024
RING_SIZE  = 8

SLOT_FREE, SLOT_WRITING, SLOT_READY = 0, 1, 2

class Detection(C.Structure):
    _pack_ = 1
    _fields_ = [
        ("x1", C.c_float),
        ("y1", C.c_float),
        ("x2", C.c_float),
        ("y2", C.c_float),
        ("score", C.c_float),
        ("class_id", C.c_int32),
    ]

class Slot(C.Structure):
    _pack_ = 1
    _fields_ = [
        ("frame_id", C.c_uint64),
        ("ready", C.c_int32),
        ("num_dets", C.c_int32),
        ("img_w", C.c_int32),
        ("img_h", C.c_int32),
        ("fps", C.c_float),
        ("dets", Detection * MAX_DETS),
        ("img", C.c_uint8 * (WIDTH_MAX * HEIGHT_MAX * 3)),
    ]

class ShmBlock(C.Structure):
    _pack_ = 1
    _fields_ = [
        ("write_frame", C.c_uint64),
        ("read_frame", C.c_uint64),
        ("ring_size", C.c_int32),
        ("max_dets", C.c_int32),
        ("nominal_w", C.c_int32),
        ("nominal_h", C.c_int32),
        ("nominal_fps", C.c_float),
        ("slots", Slot * RING_SIZE),
    ]
```
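A quick way to catch silent divergence between the two definitions is to compare struct sizes on both sides before wiring anything up. This small check is my addition, not part of the layout itself; the 24-byte figure simply follows from five 4-byte floats plus one int32 under `_pack_ = 1`:

```python
# layout_check.py - sanity-check the Python mirror against the C++ header.
# Compare these numbers with sizeof(Detection)/sizeof(Slot)/sizeof(ShmBlock)
# printed on the C++ side; any mismatch means the layouts have diverged.
import ctypes
from shm_layout import Detection, Slot, ShmBlock

print("Detection:", ctypes.sizeof(Detection))  # 24 bytes: 5 floats + 1 int32, packed
print("Slot     :", ctypes.sizeof(Slot))
print("ShmBlock :", ctypes.sizeof(ShmBlock))
```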

3) C++ producer (TensorRT YOLO) – detector_trt_shm.cpp

This is your code, tightened for SHM correctness and image-size consistency. (Keeps your TRT bits; focus is on how we write the slot.)

```cpp
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cmath>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstring>
#include <algorithm>
#include <atomic>
#include <numeric>
#include <memory>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "cuda_runtime.h"
#include "shm_layout.h"

using namespace nvinfer1;

struct Logger : ILogger {
    void log(Severity s, const char* msg) noexcept override {
        if (s <= Severity::kWARNING) std::cout << msg << std::endl;
    }
} gLogger;

#define CHECK_CUDA(x) do{auto e=(x); if(e!=cudaSuccess){ \
    std::cerr<<"CUDA: "<<cudaGetErrorString(e)<<" @ "<<__FILE__<<":"<<__LINE__<<"\n"; std::exit(1);} }while(0)

struct Args {
    std::string path;
    std::string trt;
    std::string shm_name = "/yolo_shm";
    int img = 640;
    float conf = 0.25f;
    float nms = 0.45f;
    bool dynamic = false;
};

static void parse_args(int argc, char** argv, Args& a) {
    auto next = [&](int& i){
        if (i+1>=argc){ std::cerr<<"Missing value for "<<argv[i]<<"\n"; std::exit(1); }
        return std::string(argv[++i]);
    };
    for (int i=1;i<argc;++i){
        std::string k=argv[i];
        if (k=="--path"||k=="-p") a.path=next(i);
        else if (k=="--trt"||k=="-trt") a.trt=next(i);
        else if (k=="--img") a.img=std::stoi(next(i));
        else if (k=="--conf") a.conf=std::stof(next(i));
        else if (k=="--nms") a.nms=std::stof(next(i));
        else if (k=="--dynamic"||k=="--dyn"||k=="--dynamic-shape") a.dynamic=true;
        else if (k=="--shm") a.shm_name=next(i);
        else { std::cerr<<"Unknown arg "<<k<<"\n"; std::exit(1); }
    }
    if (a.path.empty() || a.trt.empty()){
        std::cerr<<"Usage: detector_trt_shm --path <video> --trt <engine.trt> [--img 640] [--conf 0.25] [--nms 0.45] [--dynamic] [--shm /yolo_shm]\n";
        std::exit(1);
    }
}

// --- trivial letterbox for TRT input ---
struct LetterboxInfo { float scale; int pad_w; int pad_h; int net; };
static cv::Mat letterbox(const cv::Mat& img, int net, LetterboxInfo& info) {
    int w=img.cols, h=img.rows; info.net=net;
    float r=std::min(net/(float)h, net/(float)w);
    int nw = std::round(w*r), nh = std::round(h*r);
    cv::Mat resized; cv::resize(img, resized, cv::Size(nw,nh));
    int dw = net - nw, dh = net - nh;
    info.scale=r; info.pad_w=dw/2; info.pad_h=dh/2;
    cv::Mat out(net, net, CV_8UC3, cv::Scalar(114,114,114));
    resized.copyTo(out(cv::Rect(info.pad_w, info.pad_h, resized.cols, resized.rows)));
    return out;
}

// (IoU + NMS helpers)
static inline float iou(const cv::Rect2f& a, const cv::Rect2f& b){
    float inter=(a&b).area();
    float uni=a.area()+b.area()-inter+1e-6f;
    return inter/uni;
}
struct Det { cv::Rect2f box; float score; int cls; };
static std::vector<Det> nms(const std::vector<Det>& ds, float thr){
    std::vector<int> idx(ds.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::sort(idx.begin(), idx.end(), [&](int i,int j){return ds[i].score>ds[j].score;});
    std::vector<Det> keep; std::vector<char> rm(ds.size(),0);
    for(size_t a=0;a<idx.size();++a){
        int i=idx[a]; if(rm[i]) continue;
        keep.push_back(ds[i]);
        for(size_t b=a+1;b<idx.size();++b){
            int j=idx[b]; if(rm[j]) continue;
            if(iou(ds[i].box,ds[j].box)>thr) rm[j]=1;
        }
    }
    return keep;
}

// Parse typical YOLOv5 head output (x,y,w,h,obj, +classes...),
// *returning boxes in the coordinate system of the stored image*.
static void parse_yolov5_output(const float* out, const Dims& dims, float conf_thr,
                                const LetterboxInfo& lb, int src_w, int src_h,
                                std::vector<Det>& out_dets)
{
    // dims like (1, N, C) or (N, C)
    int N=0, C=0;
    if (dims.nbDims==3){ N=dims.d[1]; C=dims.d[2]; }
    else if (dims.nbDims==2){ N=dims.d[0]; C=dims.d[1]; }
    if (N<=0 || C<6) { std::cerr<<"[WARN] unexpected output dims\n"; return; }
    for (int i=0;i<N;i++){
        const float* p = out + i*C;
        float x=p[0], y=p[1], w=p[2], h=p[3], obj=p[4];
        int best_cls=-1; float best_c=0.f;
        for (int k=5;k<C;k++){ if (p[k]>best_c){ best_c=p[k]; best_cls=k-5; } }
        float score = obj * best_c;
        if (score < conf_thr) continue;
        // xywh (net) -> x1y1x2y2 (net)
        float x1 = x - w/2.f, y1 = y - h/2.f;
        float x2 = x + w/2.f, y2 = y + h/2.f;
        // Undo letterbox to original source space
        x1 -= lb.pad_w; x2 -= lb.pad_w;
        y1 -= lb.pad_h; y2 -= lb.pad_h;
        x1 /= lb.scale; y1 /= lb.scale;
        x2 /= lb.scale; y2 /= lb.scale;
        // clamp
        x1 = std::max(0.f, std::min((float)src_w-1.f, x1));
        y1 = std::max(0.f, std::min((float)src_h-1.f, y1));
        x2 = std::max(0.f, std::min((float)src_w-1.f, x2));
        y2 = std::max(0.f, std::min((float)src_h-1.f, y2));
        out_dets.push_back({ cv::Rect2f({x1,y1},{x2,y2}), score, best_cls });
    }
}

static ICudaEngine* load_engine(const std::string& path, IRuntime*& rt){
    std::ifstream f(path, std::ios::binary);
    if (!f){ std::cerr<<"Cannot open engine "<<path<<"\n"; std::exit(1); }
    f.seekg(0,std::ios::end); size_t sz=f.tellg(); f.seekg(0,std::ios::beg);
    std::vector<char> buf(sz); f.read(buf.data(), sz);
    rt = createInferRuntime(gLogger);
    if (!rt){ std::cerr<<"createInferRuntime failed\n"; std::exit(1); }
    auto* eng = rt->deserializeCudaEngine(buf.data(), sz);
    if (!eng){ std::cerr<<"deserializeCudaEngine failed\n"; std::exit(1); }
    return eng;
}

static size_t shm_size(){ return sizeof(ShmBlock); }

int main(int argc, char** argv){
    Args args; parse_args(argc, argv, args);

    // --- SHM ---
    int fd = shm_open(args.shm_name.c_str(), O_CREAT|O_RDWR, 0666);
    if (fd<0){ perror("shm_open"); return 1; }
    if (ftruncate(fd, shm_size())<0){ perror("ftruncate"); return 1; }
    void* base = mmap(nullptr, shm_size(), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
    if (base==MAP_FAILED){ perror("mmap"); return 1; }
    auto* shm = reinterpret_cast<ShmBlock*>(base);
    std::memset(shm, 0, sizeof(ShmBlock));
    shm->ring_size = RING_SIZE;
    shm->max_dets = MAX_DETS;

    // --- Video ---
    cv::VideoCapture cap(args.path);
    if (!cap.isOpened()){ std::cerr<<"Cannot open "<<args.path<<"\n"; return 1; }
    int src_w = (int)cap.get(cv::CAP_PROP_FRAME_WIDTH);
    int src_h = (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT);
    float fps = (float)(cap.get(cv::CAP_PROP_FPS) > 0 ? cap.get(cv::CAP_PROP_FPS) : 30.0);
    shm->nominal_w = src_w;
    shm->nominal_h = src_h;
    shm->nominal_fps = fps;

    // --- TRT ---
    IRuntime* rt=nullptr;
    ICudaEngine* eng=load_engine(args.trt, rt);
    std::unique_ptr<IExecutionContext> ctx(eng->createExecutionContext());

    int inIdx=-1, outIdx=-1;
    for (int i=0;i<eng->getNbIOTensors();++i){
        const char* name = eng->getIOTensorName(i);
        if (eng->getTensorIOMode(name)==TensorIOMode::kINPUT) inIdx=i; else outIdx=i;
    }
    const char* inName = eng->getIOTensorName(inIdx);
    const char* outName = eng->getIOTensorName(outIdx);

    if (args.dynamic){
        Dims d; d.nbDims=4; d.d[0]=1; d.d[1]=3; d.d[2]=args.img; d.d[3]=args.img;
        if (!ctx->setInputShape(inName, d)){ std::cerr<<"setInputShape failed\n"; return 1; }
    }

    Dims inD = ctx->getTensorShape(inName);
    Dims outD= ctx->getTensorShape(outName);
    size_t inCount=1, outCount=1;
    for (int i=0;i<inD.nbDims;i++) inCount *= (size_t)inD.d[i];
    for (int i=0;i<outD.nbDims;i++) outCount *= (size_t)outD.d[i];

    float *dIn=nullptr, *dOut=nullptr;
    CHECK_CUDA(cudaMalloc(&dIn, inCount * sizeof(float)));
    CHECK_CUDA(cudaMalloc(&dOut, outCount* sizeof(float)));
    std::vector<float> hOut(outCount);
    cudaStream_t stream; CHECK_CUDA(cudaStreamCreate(&stream));

    uint64_t frame_id=0;
    while (true){
        cv::Mat frame_bgr;
        if(!cap.read(frame_bgr)) break;

        // Prepare TRT input
        LetterboxInfo lb;
        cv::Mat lbimg = letterbox(frame_bgr, args.img, lb);
        cv::Mat img_f; lbimg.convertTo(img_f, CV_32FC3, 1.0/255.0);
        std::vector<float> chw(3*args.img*args.img);
        int hw=args.img*args.img;
        std::vector<cv::Mat> chs{
            cv::Mat(args.img,args.img,CV_32FC1, chw.data()+0*hw),
            cv::Mat(args.img,args.img,CV_32FC1, chw.data()+1*hw),
            cv::Mat(args.img,args.img,CV_32FC1, chw.data()+2*hw)
        };
        cv::split(img_f, chs);

        CHECK_CUDA(cudaMemcpyAsync(dIn, chw.data(), chw.size()*sizeof(float), cudaMemcpyHostToDevice, stream));
        ctx->setTensorAddress(inName, dIn);
        ctx->setTensorAddress(outName, dOut);
        if (!ctx->enqueueV3(stream)){ std::cerr<<"enqueueV3 failed\n"; break; }
        CHECK_CUDA(cudaMemcpyAsync(hOut.data(), dOut, hOut.size()*sizeof(float), cudaMemcpyDeviceToHost, stream));
        CHECK_CUDA(cudaStreamSynchronize(stream));

        // Parse detections to *original* frame coordinates
        std::vector<Det> dets; dets.reserve(512);
        parse_yolov5_output(hOut.data(), outD, args.conf, lb, src_w, src_h, dets);
        dets = nms(dets, args.nms);

        // ---- Write to SHM (with backpressure) ----
        frame_id++;
        int slot_id = (int)(frame_id % RING_SIZE);
        Slot& slot = shm->slots[slot_id];

        // If the reader is slow, wait for the slot to become free
        // (drop frames if you prefer: just skip waiting).
        while (__atomic_load_n(&slot.ready, __ATOMIC_ACQUIRE) == SLOT_READY) {
            usleep(1000); // simple sleep to avoid busy-wait burn
        }

        // Mark WRITING
        __atomic_store_n(&slot.ready, SLOT_WRITING, __ATOMIC_RELEASE);
        slot.frame_id = frame_id;
        slot.fps = fps;

        // Copy image (prefer original size if it fits)
        int out_w = frame_bgr.cols;
        int out_h = frame_bgr.rows;
        cv::Mat rgb; cv::cvtColor(frame_bgr, rgb, cv::COLOR_BGR2RGB);
        if (out_w > WIDTH_MAX || out_h > HEIGHT_MAX) {
            // Resize to fit capacity, adjust dets accordingly
            double sx = (double)WIDTH_MAX / out_w;
            double sy = (double)HEIGHT_MAX / out_h;
            cv::Mat resized;
            cv::resize(rgb, resized, cv::Size((int)(out_w*sx), (int)(out_h*sy)));
            out_w = resized.cols; out_h = resized.rows;
            // scale dets
            for (auto& d : dets){
                d.box.x *= (float)sx; d.box.y *= (float)sy;
                d.box.width *= (float)sx; d.box.height *= (float)sy;
            }
            std::memcpy(slot.img, resized.data, (size_t)out_w*out_h*3);
        } else {
            std::memcpy(slot.img, rgb.data, (size_t)out_w*out_h*3);
        }
        slot.img_w = out_w;
        slot.img_h = out_h;

        // Copy detections (clamped to MAX_DETS)
        int M = std::min((int)dets.size(), MAX_DETS);
        slot.num_dets = M;
        for (int i=0;i<M;i++){
            slot.dets[i].x1 = dets[i].box.x;
            slot.dets[i].y1 = dets[i].box.y;
            slot.dets[i].x2 = dets[i].box.x + dets[i].box.width;
            slot.dets[i].y2 = dets[i].box.y + dets[i].box.height;
            slot.dets[i].score = dets[i].score;
            slot.dets[i].class_id = dets[i].cls;
        }

        // Publish
        __atomic_thread_fence(__ATOMIC_SEQ_CST);
        __atomic_store_n(&slot.ready, SLOT_READY, __ATOMIC_RELEASE);
        __atomic_store_n(&shm->write_frame, frame_id, __ATOMIC_RELEASE);

        // (Optional debug)
        // std::cout<<"Frame "<<frame_id<<" dets="<<M<<" img="<<out_w<<"x"<<out_h<<"\n";
    }

    cudaStreamDestroy(stream);
    cudaFree(dIn); cudaFree(dOut);
    ctx.reset();   // destroy the context before the engine
    delete eng;
    delete rt;
    munmap(base, shm_size());
    close(fd);
    shm_unlink(args.shm_name.c_str());
    return 0;
}
```

Build (example):

```bash
g++ -std=c++17 detector_trt_shm.cpp -o detector_trt_shm \
  `pkg-config --cflags --libs opencv4` \
  -L/path/to/TensorRT/lib -lnvinfer -lcudart -lpthread -lrt
```

Run:

```bash
./detector_trt_shm --path input.mp4 --trt yolov5.engine --img 640 --conf 0.25 --nms 0.45 --shm /yolo_shm
```

4) Python consumer (tracker side) – tracker_shm.py

This shows a correct reader you can drop into your project. It reads frames + dets from SHM safely and returns frame (BGR), dets (Nx6 float32: x1,y1,x2,y2,score,class).

If you want to keep your full HOPTracker loop, just replace your open_shm, read_frame, and slot_to_tensor with the ones below and keep the rest of your logic.

```python
# tracker_shm.py
import os, time, mmap, argparse, ctypes as C
import numpy as np
import cv2

from shm_layout import ShmBlock, Slot, Detection, RING_SIZE, MAX_DETS, SLOT_READY, SLOT_FREE


def open_shm(name: str):
    """
    POSIX shm_open wrapper: the Python side opens the file path
    /dev/shm/<name-without-leading-slash> and mmaps the whole ShmBlock.
    """
    path = f"/dev/shm/{name.lstrip('/')}"
    fd = os.open(path, os.O_RDWR)
    size = C.sizeof(ShmBlock)
    mm = mmap.mmap(fd, size, mmap.MAP_SHARED, mmap.PROT_READ | mmap.PROT_WRITE)
    os.close(fd)
    block = ShmBlock.from_buffer(mm)
    return mm, block


def read_slot_blocking(block: ShmBlock, frame_id_expected: int, timeout_ms=1000):
    """
    Waits for the specific frame_id slot to become READY.
    Returns the Slot or None on timeout.
    """
    t0 = time.time()
    while True:
        last = block.write_frame
        if last >= frame_id_expected:
            sid = frame_id_expected % RING_SIZE
            slot = block.slots[sid]
            if slot.ready == SLOT_READY and slot.frame_id == frame_id_expected:
                return slot
            # if the writer skipped ahead and overwrote the expected slot, jump forward
            if last - frame_id_expected >= RING_SIZE:
                frame_id_expected = last
        if (time.time() - t0) * 1000 > timeout_ms:
            return None
        time.sleep(0.0005)


def slot_to_numpy_frame(slot: Slot) -> np.ndarray:
    """
    Returns a *copy* of the BGR frame. Copying is essential so we can
    mark the slot FREE immediately.
    """
    w, h = int(slot.img_w), int(slot.img_h)
    nbytes = w * h * 3
    # slice only the used bytes before reshape:
    buf = np.frombuffer(slot.img, dtype=np.uint8, count=nbytes)
    rgb = buf.reshape((h, w, 3)).copy()  # copy to detach from SHM
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr


def slot_to_dets(slot: Slot) -> np.ndarray:
    n = int(slot.num_dets)
    if n <= 0:
        return np.zeros((0, 6), dtype=np.float32)
    arr = np.zeros((n, 6), dtype=np.float32)
    for i in range(n):
        d = slot.dets[i]
        arr[i, 0] = d.x1; arr[i, 1] = d.y1; arr[i, 2] = d.x2; arr[i, 3] = d.y2
        arr[i, 4] = d.score; arr[i, 5] = float(d.class_id)
    return arr


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--shm", type=str, default="/yolo_shm")
    ap.add_argument("--show", action="store_true")
    args = ap.parse_args()

    mm, shm = open_shm(args.shm)
    width = shm.nominal_w
    height = shm.nominal_h
    fps = shm.nominal_fps if shm.nominal_fps > 0 else 30.0
    print(f"Connected to SHM. Nominal stream {width}x{height} @ {fps:.2f}fps")

    frame_id = 1
    try:
        while True:
            slot = read_slot_blocking(shm, frame_id, timeout_ms=2000)
            if slot is None:
                print("Timeout waiting for frame", frame_id)
                # skip to newest if writer is alive
                latest = shm.write_frame
                if latest > frame_id:
                    frame_id = latest
                else:
                    break
                continue

            frame = slot_to_numpy_frame(slot)
            dets = slot_to_dets(slot)

            # mark slot FREE and advance read pointer
            slot.ready = SLOT_FREE
            shm.read_frame = frame_id

            # TODO: integrate with your tracker here:
            #  - Convert dets to torch if needed
            #  - Call HOPTracker.detect_track_fuse or hopping_update
            # For demo we just draw boxes:
            for (x1, y1, x2, y2, score, cls) in dets:
                cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)
                cv2.putText(frame, f"{int(cls)}:{score:.2f}", (int(x1), int(y1) - 4),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

            if args.show:
                cv2.imshow("Python tracker (reader)", frame)
                if cv2.waitKey(1) & 0xFF in [27, ord('q'), ord('Q')]:
                    break

            frame_id += 1
    finally:
        try:
            cv2.destroyAllWindows()
        except Exception:
            pass
        mm.close()


if __name__ == "__main__":
    main()
```
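Run it alongside the detector (the flags are the ones defined in the argparse above):

```bash
python3 tracker_shm.py --shm /yolo_shm --show
```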

Where to plug into your HOPTracker loop

In your existing script, keep your YOLOX/HOPTracker code as-is, but replace your open_shm, read_frame, slot_to_tensor implementations with the versions above:

  • Use slot_to_numpy_frame(slot) to get frame (BGR).
  • Use slot_to_dets(slot) to get dets as Nx6 float32. If your tracker expects a torch.Tensor, just:

    ```python
    import torch
    dets_torch = torch.from_numpy(dets).float().cpu()
    ```

  • After you copy the frame, immediately do:

    ```python
    slot.ready = SLOT_FREE
    shm.read_frame = frame_id
    ```
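Putting those pieces together, one iteration of your loop would look roughly like this: a sketch assuming the read_slot_blocking/slot_to_* helpers above and the detect_track_fuse call signature from your original script.

```python
# Sketch of one loop iteration inside your HOPTracker script. Assumes the
# helpers from tracker_shm.py are imported and `tracker` is your HOPTracker.
import torch

slot = read_slot_blocking(shm, frame_id, timeout_ms=2000)
if slot is not None:
    frame = slot_to_numpy_frame(slot)   # private BGR copy, safe to keep
    dets = slot_to_dets(slot)           # Nx6 float32 [x1 y1 x2 y2 score class]
    slot.ready = SLOT_FREE              # hand the slot back to the writer
    shm.read_frame = frame_id
    if len(dets):
        dets_t = torch.from_numpy(dets).float().cpu()
        online_targets = tracker.detect_track_fuse(
            dets_t, [frame.shape[0], frame.shape[1], frame], (640, 640)
        )
```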

Why this fixes the intermittent problem

  • No more reshape errors: Python only reshapes the first img_w*img_h*3 bytes, which exactly match what the writer stored.
  • No tearing: The writer sets ready=2 only after all bytes & metadata are written; the reader copies the data and then sets ready=0. The writer also issues a memory fence before publishing, so field writes aren't reordered by the compiler/CPU.
  • Consistent coordinates: Detections are in the same coordinate system as the stored frame. If the frame is resized to fit the SHM capacity, the writer scales both the image and the boxes consistently, so overlay/tracking line up.

Notes & options

  • If you want zero copies of the image for speed, you can avoid .copy() in slot_to_numpy_frame and defer slot.ready=SLOT_FREE until you’re done using that frame. The current version copies for safety.
  • If you always want 640×640 (network space) everywhere, store the 640×640 letterboxed image in SHM and do not un-letterbox the detections. Then set slot.img_w=slot.img_h=640 and skip the scaling code. The tracker & overlays will use the net space consistently. (The above code uses original-space by default for better visual fidelity.)
  • If your camera can be >1080p, bump WIDTH_MAX/HEIGHT_MAX in both header & Python mirror (e.g., 3840×2160). Remember that memory = RING_SIZE * WIDTH_MAX * HEIGHT_MAX * 3 bytes (plus dets), so plan accordingly.
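For the defaults above, you can let ctypes do that arithmetic (a small sketch using the shm_layout.py mirror; with 1920×1080 capacity each slot is roughly 6 MiB, so 8 slots come to about 48 MiB):

```python
# shm_budget.py - compute the mapping size before bumping WIDTH_MAX/HEIGHT_MAX.
import ctypes
from shm_layout import ShmBlock, Slot, RING_SIZE, WIDTH_MAX, HEIGHT_MAX

per_slot = ctypes.sizeof(Slot)    # image capacity + dets array + per-slot metadata
total = ctypes.sizeof(ShmBlock)   # header + RING_SIZE slots
print(f"per slot: {per_slot / 2**20:.1f} MiB ({WIDTH_MAX}x{HEIGHT_MAX}x3 image)")
print(f"total   : {total / 2**20:.1f} MiB for RING_SIZE={RING_SIZE}")
```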

If you want, paste your current track_utils_shm.py + HOPTracker callsites and I’ll drop these functions into them for you so it’s one cohesive script.
