我现在使用yolov11进行改进我的目标检测,我的场景是悬挂式单轨小目标异物检测,主要异物有风筝、无...
Creato il: 8 gennaio 2026
Risposto usando GPT-5.2 Pro da Chat01
Creato il: 8 gennaio 2026
Risposto usando GPT-5.2 Pro da Chat01
我现在使用yolov11进行改进我的目标检测,我的场景是悬挂式单轨小目标异物检测,主要异物有风筝、无人机、塑料布、气球、鸟,数据属于不均衡样本,样本不平衡,有的有2千张,有的只有几百张,YOLOv11没有改进前P,R,MAP50,MAP50-95的结果分别如下:0.923,0.828,0.89,0.719,我的配置文件改为下面的代码后:backbone:
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] # 11
  - [[-1, 6], 1, Concat, [1]] # 12
  - [-1, 2, C3k2, [512, False]] # 13
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] # 14
  - [[-1, 4], 1, Concat, [1]] # 15
  - [-1, 2, C3k2, [256, False]] # 16
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] # 17
  - [[-1, 2], 1, Concat, [1]] # 18
  - [-1, 2, C3k2, [128, False]] # 19 P2_out
  - [-1, 1, SPDConvLiteResEMA, [256, 3, 2, 0.5]] # 20 P2->P3
  - [[-1, 16], 1, Concat, [1]] # 21
  - [-1, 2, C3k2, [256, False]] # 22 P3_out
  - [-1, 1, SPDConvLiteResEMA, [512, 3, 2, 0.5]] # 23 P3->P4
  - [[-1, 13], 1, Concat, [1]] # 24
  - [-1, 2, C3k2, [512, False]] # 25 P4_out
  - [-1, 1, SPDConvLiteResEMA, [1024, 3, 2, 0.5]] # 26 P4->P5
  - [[-1, 10], 1, Concat, [1]] # 27
  - [-1, 2, C3k2, [1024, True]] # 28 P5_out
  - [[19, 22, 25, 28], 1, DetectLSDECD, [nc, 16, 16]] # Detect(P2, P3, P4, P5)
class ConvBNAct(nn.Module):
    """Conv2d + BatchNorm2d + optional SiLU activation.

    Args:
        c1: input channels.
        c2: output channels.
        k: kernel size, int or 2-tuple.
        s: stride.
        p: padding; when None, auto-pads to "same" output for odd kernels.
        g: conv groups.
        act: apply SiLU when True, otherwise Identity.
    """

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super().__init__()
        if p is None:
            # "same" padding for odd kernel sizes
            p = (k - 1) // 2 if isinstance(k, int) else (k[0] // 2, k[1] // 2)
        self.conv = nn.Conv2d(c1, c2, k, s, p, groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act else nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))
class MSFAM(nn.Module):
    """Multi-Scale Feature Aggregation Module.

    MSFAM(c1, c2):
      - residual branch (1x1 projection only when c1 != c2)
      - main branch: 1x1 stem -> split into 3 sub-branches:
          b1: 3x3 -> 1x1
          b2: 5x5 -> 1x1
          b3: 1x1
      - concat -> 1x1 fuse -> + residual
    """

    def __init__(self, c1, c2, act=True):
        super().__init__()
        self.res = ConvBNAct(c1, c2, k=1, s=1, act=act) if c1 != c2 else nn.Identity()
        # main stem 1x1 (keeps spatial size)
        self.stem = ConvBNAct(c1, c2, k=1, s=1, act=act)
        # three parallel sub-branches at different receptive fields
        self.b1 = nn.Sequential(
            ConvBNAct(c2, c2, k=3, s=1, act=act),
            ConvBNAct(c2, c2, k=1, s=1, act=act),
        )
        self.b2 = nn.Sequential(
            ConvBNAct(c2, c2, k=5, s=1, act=act),
            ConvBNAct(c2, c2, k=1, s=1, act=act),
        )
        self.b3 = ConvBNAct(c2, c2, k=1, s=1, act=act)
        # concat(3 * c2) -> c2
        self.fuse = ConvBNAct(3 * c2, c2, k=1, s=1, act=act)

    def forward(self, x):
        r = self.res(x)
        s = self.stem(x)
        y1 = self.b1(s)
        y2 = self.b2(s)
        y3 = self.b3(s)
        y = self.fuse(torch.cat([y1, y2, y3], dim=1))
        return y + r
class EMA(nn.Module):
    """Efficient Multi-scale Attention.

    Splits channels into `factor` groups, builds coordinate attention
    (pooled along H and W) plus a 3x3 local branch, and cross-weights the
    two branches via softmax-normalized global descriptors.

    Args:
        channels: input/output channel count; must be divisible by factor.
        c2: unused, kept for config-signature compatibility.
        factor: number of channel groups.
    """

    def __init__(self, channels, c2=None, factor=32):
        super().__init__()
        self.groups = factor
        assert channels % factor == 0
        assert channels // self.groups > 0
        self.softmax = nn.Softmax(-1)
        self.agp = nn.AdaptiveAvgPool2d((1, 1))
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))  # pool width away
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))  # pool height away
        self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups)
        self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        b, c, h, w = x.size()
        group_x = x.reshape(b * self.groups, -1, h, w)  # b*g, c//g, h, w
        x_h = self.pool_h(group_x)
        x_w = self.pool_w(group_x).permute(0, 1, 3, 2)
        # joint 1x1 over the concatenated H and W descriptors, then split back
        hw = self.conv1x1(torch.cat([x_h, x_w], dim=2))
        x_h, x_w = torch.split(hw, [h, w], dim=2)
        x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid())
        x2 = self.conv3x3(group_x)
        # cross-branch attention: each branch's GAP softmax weights the other's features
        x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x12 = x2.reshape(b * self.groups, c // self.groups, -1)  # b*g, c//g, hw
        x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x22 = x1.reshape(b * self.groups, c // self.groups, -1)  # b*g, c//g, hw
        weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w)
        return (group_x * weights.sigmoid()).reshape(b, c, h, w)
import torch
import torch.nn as nn
import torch.nn.functional as F
class SPD(nn.Module):
    """Space-to-depth rearrangement.

    Moves every s x s spatial patch into the channel dimension:
    (b, c, h, w) -> (b, c*s*s, ceil(h/s), ceil(w/s)).
    Pads bottom/right with zeros when h or w is not divisible by s,
    so no information is discarded.
    """

    def __init__(self, scale=2):
        super().__init__()
        assert scale >= 2 and int(scale) == scale
        self.scale = int(scale)

    def forward(self, x):
        b, c, h, w = x.shape
        s = self.scale
        pad_h = (s - h % s) % s
        pad_w = (s - w % s) % s
        if pad_h or pad_w:
            x = F.pad(x, (0, pad_w, 0, pad_h))
            b, c, h, w = x.shape
        x = x.view(b, c, h // s, s, w // s, s)
        # bring the (s, s) offsets next to the channel axis before flattening
        x = x.permute(0, 1, 3, 5, 2, 4).contiguous()
        return x.view(b, c * s * s, h // s, w // s)
class SPDConvLiteRes(nn.Module):
    """Identity-preserving SPD downsampling block.

    out = skip(x) + alpha * main(SPD(x))
    alpha is zero-initialized, so at the start of training the block is
    exactly the plain strided 1x1 conv baseline and cannot degrade it.
    """

    def __init__(self, c1, c2, k=3, s=2, ratio=0.5, act=True):
        super().__init__()
        self.spd = SPD(scale=s)
        cin = c1 * s * s
        hidden = max(16, int(c2 * ratio))
        # baseline path: guarantees "at least as good as plain downsample"
        self.skip = Conv(c1, c2, 1, s, act=act)  # 1x1, stride=s
        # SPD enhancement path: pointwise -> depthwise -> pointwise
        self.pw1 = Conv(cin, hidden, 1, 1, act=act)
        self.dw = Conv(hidden, hidden, k, 1, g=hidden, act=act)
        self.pw2 = Conv(hidden, c2, 1, 1, act=act)
        # learnable gate, zero-initialized
        self.alpha = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        y = self.spd(x)
        y = self.pw1(y)
        y = self.dw(y)
        y = self.pw2(y)
        return self.skip(x) + self.alpha * y
class SPDConvLiteResEMA(nn.Module):
    """SPDConvLiteRes followed by residual-gated EMA attention.

    out = base + alpha * spd_branch (inside SPDConvLiteRes), then
    x = x + beta * (EMA(x) - x), with beta zero-initialized so the
    attention is a no-op at the start of training.
    """

    def __init__(self, c1, c2, k=3, s=2, ratio=0.5, ema_factor=32, act=True):
        super().__init__()
        self.core = SPDConvLiteRes(c1, c2, k=k, s=s, ratio=ratio, act=act)
        self.attn = EMA(c2, factor=ema_factor)  # reuses the EMA module above
        self.beta = nn.Parameter(torch.zeros(1))  # attention gate (zero-init)

    def forward(self, x):
        y = self.core(x)
        a = self.attn(y)
        return y + self.beta * (a - y)
检测头的代码如下:# ultralytics/nn/modules/lsdecd.py
import math
from typing import List
import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics.nn.modules.head import Detect
def _choose_gn_groups(c: int, max_groups: int = 16) -> int:
"""选择能整除通道数的 GN groups,避免 GroupNorm 报错。"""
g = min(max_groups, c)
while g > 1 and (c % g) != 0:
g -= 1
return max(g, 1)
class ConvGN(nn.Module):
    """Conv2d + GroupNorm + SiLU (the Conv_GN unit of the LSDECD head, YOLOv11-DEC)."""

    def __init__(self, c1, c2, k=1, s=1, p=None, groups=1, gn_groups=16):
        super().__init__()
        if p is None:
            p = k // 2  # "same" padding for odd kernels
        self.conv = nn.Conv2d(c1, c2, k, s, p, groups=groups, bias=False)
        # GN group count is clamped to a divisor of c2 to avoid GroupNorm errors
        self.gn = nn.GroupNorm(_choose_gn_groups(c2, gn_groups), c2)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.gn(self.conv(x)))
class DEConv2d(nn.Module):
    """Detail-Enhanced Convolution (DEConv), re-parameterizable to one conv.

    Paper eq. (4): DEConv(F) = sum_i F * K_i = F * (sum_i K_i) = F * K_cvt,
    so the five difference kernels collapse into a single 3x3 convolution.

    From one learnable 3x3 kernel W we derive:
      K1: vanilla conv
      K2: CDC (≈ CPDC, central pixel difference)
      K3: ADC (≈ APDC, angular pixel difference)
      K4: HDC (horizontal difference)
      K5: VDC (vertical difference)
    K_cvt = K1 + K2 + K3 + K4 + K5 is applied with a single F.conv2d.
    """

    def __init__(self, c1, c2, k=3, s=1, p=None, g=1, bias=True):
        super().__init__()
        assert k == 3, "This implementation follows the paper's 3x3 DEConv (extend if needed)"
        if p is None:
            p = 1
        self.stride = s
        self.padding = p
        self.groups = g
        self.weight = nn.Parameter(torch.empty(c2, c1 // g, 3, 3))
        self.bias = nn.Parameter(torch.zeros(c2)) if bias else None
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        self.deploy = False
        self.reparam_conv = None  # fused single conv, used after switch_to_deploy()

    @staticmethod
    def _cpdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """CPDC (central pixel difference conv) equivalent kernel.

        y = sum_{i != c} w_i * (x_i - x_c), i.e. a plain conv whose center
        weight is the negated sum of the eight neighbors.
        """
        wc = w.clone()
        # neighbors sum (excludes center [1, 1])
        nsum = (w[:, :, 0, 0] + w[:, :, 0, 1] + w[:, :, 0, 2]
                + w[:, :, 1, 0] + w[:, :, 1, 2]
                + w[:, :, 2, 0] + w[:, :, 2, 1] + w[:, :, 2, 2])
        wc[:, :, 1, 1] = -nsum
        return wc

    @staticmethod
    def _apdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """APDC (angular pixel difference conv) equivalent 3x3 kernel.

        Uses the equivalence coefficients from the PDC supplementary
        material; the center is zero. w1..w9 are laid out row-major:
            w1 w2 w3
            w4 w5 w6
            w7 w8 w9
        """
        wa = torch.zeros_like(w)
        w1, w2, w3 = w[:, :, 0, 0], w[:, :, 0, 1], w[:, :, 0, 2]
        w4, w5, w6 = w[:, :, 1, 0], w[:, :, 1, 1], w[:, :, 1, 2]
        w7, w8, w9 = w[:, :, 2, 0], w[:, :, 2, 1], w[:, :, 2, 2]
        wa[:, :, 0, 0] = w1 - w4
        wa[:, :, 0, 1] = w2 - w1
        wa[:, :, 0, 2] = w3 - w2
        wa[:, :, 1, 0] = w4 - w7
        wa[:, :, 1, 1] = 0.0
        wa[:, :, 1, 2] = w6 - w3
        wa[:, :, 2, 0] = w7 - w8
        wa[:, :, 2, 1] = w8 - w9
        wa[:, :, 2, 2] = w9 - w6
        return wa

    @staticmethod
    def _hdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """HDC (horizontal difference conv) equivalent kernel.

        Typical form: [[w1, 0, -w1], [w4, 0, -w4], [w7, 0, -w7]].
        """
        wh = torch.zeros_like(w)
        wh[:, :, 0, 0] = w[:, :, 0, 0]
        wh[:, :, 1, 0] = w[:, :, 1, 0]
        wh[:, :, 2, 0] = w[:, :, 2, 0]
        wh[:, :, 0, 2] = -w[:, :, 0, 0]
        wh[:, :, 1, 2] = -w[:, :, 1, 0]
        wh[:, :, 2, 2] = -w[:, :, 2, 0]
        return wh

    @staticmethod
    def _vdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """VDC (vertical difference conv) equivalent kernel.

        Typical form: [[w1, w2, w3], [0, 0, 0], [-w1, -w2, -w3]].
        """
        wv = torch.zeros_like(w)
        wv[:, :, 0, 0] = w[:, :, 0, 0]
        wv[:, :, 0, 1] = w[:, :, 0, 1]
        wv[:, :, 0, 2] = w[:, :, 0, 2]
        wv[:, :, 2, 0] = -w[:, :, 0, 0]
        wv[:, :, 2, 1] = -w[:, :, 0, 1]
        wv[:, :, 2, 2] = -w[:, :, 0, 2]
        return wv

    def get_equivalent_kernel_bias(self):
        """Return (K_cvt, bias): the single fused kernel and its bias."""
        w = self.weight
        k1 = w
        k2 = self._cpdc_kernel(w)  # CDC ≈ CPDC
        k3 = self._apdc_kernel(w)  # ADC ≈ APDC
        k4 = self._hdc_kernel(w)   # HDC
        k5 = self._vdc_kernel(w)   # VDC
        k = k1 + k2 + k3 + k4 + k5
        return k, self.bias

    def switch_to_deploy(self):
        """Fuse DEConv into a single nn.Conv2d (faster inference; paper's re-parameterization)."""
        if self.deploy:
            return
        k, b = self.get_equivalent_kernel_bias()
        conv = nn.Conv2d(
            in_channels=k.shape[1] * self.groups,
            out_channels=k.shape[0],
            kernel_size=3,
            stride=self.stride,
            padding=self.padding,
            groups=self.groups,
            bias=(b is not None),
        )
        conv.weight.data.copy_(k)
        if b is not None:
            conv.bias.data.copy_(b)
        self.reparam_conv = conv
        self.deploy = True
        # drop training-time parameter references
        del self.weight
        if self.bias is not None:
            del self.bias

    def forward(self, x):
        if self.deploy and self.reparam_conv is not None:
            return self.reparam_conv(x)
        k, b = self.get_equivalent_kernel_bias()
        return F.conv2d(x, k, b, stride=self.stride, padding=self.padding, groups=self.groups)
class DEConvGN(nn.Module):
    """DEConv + GroupNorm + SiLU (the DEConv_GN unit of the LSDECD head, YOLOv11-DEC)."""

    def __init__(self, c1, c2, k=3, s=1, p=1, g=1, gn_groups=16):
        super().__init__()
        self.de = DEConv2d(c1, c2, k=k, s=s, p=p, g=g, bias=False)
        self.gn = nn.GroupNorm(_choose_gn_groups(c2, gn_groups), c2)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.gn(self.de(x)))

    def switch_to_deploy(self):
        """Propagate re-parameterization to the inner DEConv."""
        self.de.switch_to_deploy()
class Scale(nn.Module):
    """Per-level learnable scalar multiplier (common in decoupled-head regression branches)."""

    def __init__(self, init=1.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(float(init)))

    def forward(self, x):
        return x * self.scale
class DetectLSDECD(Detect):
    """LSDECD detection head (YOLOv11-DEC reproduction).

    - Conv_GN / DEConv_GN structure with decoupled cls/reg branches (paper Fig. 3)
    - DEConv re-parameterizes to a plain conv at deploy time (paper eq. (4))
    - GroupNorm for robustness in complex scenes

    YAML args (without ch): [nc, reg_max=16, gn_groups=16]
    """

    def __init__(self, nc=80, reg_max=16, gn_groups=16, ch=()):
        # Let the parent Detect set up its required fields (nl/no/dfl, ...);
        # older Ultralytics versions do not accept reg_max in __init__.
        try:
            super().__init__(nc=nc, ch=ch, reg_max=reg_max)
        except TypeError:
            super().__init__(nc=nc, ch=ch)
        self.reg_max = int(reg_max)
        self.nc = int(nc)
        self.nl = len(ch)
        self.gn_groups = int(gn_groups)
        # DFL regression channels: most Detect variants use 4 * reg_max
        self.reg_out = 4 * int(self.reg_max)
        self.no = self.nc + self.reg_out
        # Shared stem width for the decoupled head (paper Fig. 3)
        c_ = max(16, ch[0] // 4, self.reg_out) if len(ch) else 64
        self.stem = nn.ModuleList([ConvGN(c, c_, k=1, gn_groups=self.gn_groups) for c in ch])
        # "Shared" part: two DEConvGN layers used by every scale
        self.shared_de1 = DEConvGN(c_, c_, k=3, gn_groups=self.gn_groups)
        self.shared_de2 = DEConvGN(c_, c_, k=3, gn_groups=self.gn_groups)
        # Per-scale prediction layers (lets bias_init use per-stride priors)
        self.reg_pred = nn.ModuleList([nn.Conv2d(c_, self.reg_out, 1) for _ in ch])
        self.cls_pred = nn.ModuleList([nn.Conv2d(c_, self.nc, 1) for _ in ch])
        self.scales = nn.ModuleList([Scale(1.0) for _ in ch])
        # The loss reads .stride; make sure it exists even before build
        if not hasattr(self, "stride"):
            self.stride = torch.zeros(self.nl)

    def forward(self, x: List[torch.Tensor]):
        """x: list of neck feature maps, one per detection scale."""
        outs = []
        for i in range(self.nl):
            f = self.stem[i](x[i])
            f = self.shared_de2(self.shared_de1(f))
            reg = self.scales[i](self.reg_pred[i](f))
            cls = self.cls_pred[i](f)
            outs.append(torch.cat((reg, cls), 1))
        if self.training:
            return outs
        # Reuse Detect's inference decoding (method name differs across versions)
        if hasattr(self, "_inference"):
            y = self._inference(outs)
            return y if getattr(self, "export", False) else (y, outs)
        if hasattr(self, "inference"):
            y = self.inference(outs)
            return y if getattr(self, "export", False) else (y, outs)
        return outs

    def switch_to_deploy(self):
        """Fuse the shared DEConvGN layers into plain convs for faster inference."""
        self.shared_de1.switch_to_deploy()
        self.shared_de2.switch_to_deploy()
以上的代码效果很好,0.982,0.923,0.968,0.84,以上使用的是基本的CIOU,现在你需要告诉我如何改进LOSS,我使用下面的LOSS:
class WIoU_Scale:
    """Running-mean IoU state for Wise-IoU's dynamic focusing mechanism.

    monotonous: {
        None: original v1
        True: monotonic FM v2
        False: non-monotonic FM v3
    }
    _momentum: momentum of the running mean (time constant ~7000 iterations).
    _is_train: freeze the running mean when False (e.g. during validation).
    """

    iou_mean = 1.0
    monotonous = False
    _momentum = 1 - 0.5 ** (1 / 7000)
    _is_train = True

    def __init__(self, iou):
        self.iou = iou
        self._update(self)

    @classmethod
    def _update(cls, self):
        # Exponential moving average of the (detached) batch-mean IoU loss
        if cls._is_train:
            cls.iou_mean = (1 - cls._momentum) * cls.iou_mean + \
                cls._momentum * self.iou.detach().mean().item()

    @classmethod
    def _scaled_loss(cls, self, gamma=1.9, delta=3):
        if isinstance(self.monotonous, bool):
            if self.monotonous:
                # v2: monotonic focusing
                return (self.iou.detach() / self.iou_mean).sqrt()
            # v3: non-monotonic focusing via outlier degree beta
            beta = self.iou.detach() / self.iou_mean
            alpha = delta * torch.pow(gamma, beta - delta)
            return beta / alpha
        return 1
def bbox_iou(box1, box2, xywh=True,
             GIoU=False, DIoU=False, CIoU=False, SIoU=False, EIoU=False,
             WIoU=False, Focal=False,
             Focaler=False, d=0.00, u=0.95,
             alpha=1, gamma=0.5, scale=False, eps=1e-7):
    """Return IoU-family values of box1(1, 4) to box2(n, 4).

    Variant flags (mutually exclusive in practice): GIoU/DIoU/CIoU/SIoU/EIoU/WIoU.
    Focal: additionally return the focal weight raw_iou**gamma (not supported with WIoU).
    Focaler: map raw IoU linearly from [d, u] to [0, 1] (WIoU branch only).
    alpha: alpha-IoU exponent; scale: enable WIoU_Scale running-mean focusing.

    Returns a single tensor for plain/GIoU/DIoU/CIoU/SIoU/EIoU, a tuple
    (iou_term, weight_term) for Focal or WIoU, and a 3-tuple when scale=True.
    """
    # Get the coordinates of bounding boxes
    if xywh:  # transform from xywh to xyxy
        (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
        w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
        b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
        b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
    else:  # x1, y1, x2, y2
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
        w1, h1 = b1_x2 - b1_x1, (b1_y2 - b1_y1).clamp(eps)
        w2, h2 = b2_x2 - b2_x1, (b2_y2 - b2_y1).clamp(eps)

    # Intersection area
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0)

    # Union area
    union = w1 * h1 + w2 * h2 - inter + eps

    # raw IoU in [0, 1] — the Focaler mapping must use this un-powered value
    raw_iou = inter / (union + eps)
    # alpha-IoU (kept from the original implementation)
    iou = torch.pow(raw_iou, alpha)

    if scale:
        self = WIoU_Scale(1 - raw_iou)

    if CIoU or DIoU or GIoU or EIoU or SIoU or WIoU:
        cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex width
        ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height
        if CIoU or DIoU or EIoU or SIoU or WIoU:
            c2 = (cw ** 2 + ch ** 2) ** alpha + eps  # convex diagonal squared
            rho2 = (((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
                     (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4) ** alpha  # center distance squared
            if CIoU:
                # aspect-ratio consistency term
                v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
                with torch.no_grad():
                    alpha_ciou = v / (v - iou + (1 + eps))
                if Focal:
                    return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha)), torch.pow(raw_iou, gamma)
                return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha))
            elif EIoU:
                rho_w2 = ((b2_x2 - b2_x1) - (b1_x2 - b1_x1)) ** 2
                rho_h2 = ((b2_y2 - b2_y1) - (b1_y2 - b1_y1)) ** 2
                cw2 = torch.pow(cw ** 2 + eps, alpha)
                ch2 = torch.pow(ch ** 2 + eps, alpha)
                if Focal:
                    return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2), torch.pow(raw_iou, gamma)
                return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2)
            elif SIoU:
                # SCYLLA-IoU: angle cost + distance cost + shape cost
                s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
                s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
                sigma = torch.pow(s_cw ** 2 + s_ch ** 2, 0.5)
                sin_alpha_1 = torch.abs(s_cw) / sigma
                sin_alpha_2 = torch.abs(s_ch) / sigma
                threshold = pow(2, 0.5) / 2
                sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
                angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
                rho_x = (s_cw / cw) ** 2
                rho_y = (s_ch / ch) ** 2
                gamma_siou = angle_cost - 2
                distance_cost = 2 - torch.exp(gamma_siou * rho_x) - torch.exp(gamma_siou * rho_y)
                omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
                omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
                shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
                if Focal:
                    return iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha), torch.pow(raw_iou, gamma)
                return iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha)
            elif WIoU:
                if Focal:
                    raise RuntimeError("WIoU do not support Focal.")
                exp_term = torch.exp((rho2 / c2))  # WIoU v1 distance-attention weight
                # Focaler mapping acts on raw_iou (0~1) only
                if Focaler:
                    if u <= d + 1e-12:
                        raise ValueError(f"Invalid Focaler interval: u({u}) must be > d({d})")
                    iou_used = ((raw_iou - d) / (u - d)).clamp(0.0, 1.0)
                else:
                    iou_used = iou
                if scale:
                    return getattr(WIoU_Scale, '_scaled_loss')(self), (1 - iou_used) * exp_term, iou_used
                return iou_used, exp_term  # "IoU term + weight term"
            # DIoU
            if Focal:
                return iou - rho2 / c2, torch.pow(raw_iou, gamma)
            return iou - rho2 / c2
        # GIoU https://arxiv.org/pdf/1902.09630.pdf
        c_area = cw * ch + eps  # convex area
        if Focal:
            return iou - torch.pow((c_area - union) / c_area + eps, alpha), torch.pow(raw_iou, gamma)
        return iou - torch.pow((c_area - union) / c_area + eps, alpha)
    # plain (alpha-)IoU
    if Focal:
        return iou, torch.pow(raw_iou, gamma)
    return iou
class BboxLoss(nn.Module):
    """Criterion class for computing training losses for bounding boxes (WIoU + Focaler variant)."""

    def __init__(self, reg_max=16):
        """Initialize the BboxLoss module; DFL is enabled when reg_max > 1."""
        super().__init__()
        self.dfl_loss = DFLoss(reg_max) if reg_max > 1 else None

    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
        """Compute IoU and DFL losses for bounding boxes."""
        weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
        # bbox_iou returns (Focaler-mapped IoU term, WIoU distance-attention weight)
        iou_f, exp_term = bbox_iou(
            pred_bboxes[fg_mask],
            target_bboxes[fg_mask],
            xywh=False,
            WIoU=True,
            Focaler=True,
            d=0.10, u=0.90,  # 0.10~0.90 recommended start for imbalanced / small-object data
            scale=False,
        )
        # exp_term is detached: it only rescales gradient magnitude (sample re-weighting)
        loss_iou = (((1.0 - iou_f) * exp_term.detach()) * weight).sum() / target_scores_sum

        if self.dfl_loss:
            target_ltrb = bbox2dist(anchor_points, target_bboxes, self.dfl_loss.reg_max - 1)
            loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max), target_ltrb[fg_mask]) * weight
            loss_dfl = loss_dfl.sum() / target_scores_sum
        else:
            # construct directly on the right device (avoids an extra copy)
            loss_dfl = torch.tensor(0.0, device=pred_dist.device)
        return loss_iou, loss_dfl
使用这个LOSS后代码结果很差,0.958,0.917,0.941,0.818,只增加这个LOSS,其他模块保持yolov11的原版,结果是0.973,0.875,0.927,0.34,也是有增益,下一步怎么改进这个LOSS,请给出一个合适的思路。
你现在这个现象其实很典型:mAP50 还行甚至上升,但 mAP50-95 明显掉,说明回归在“粗定位”还能到 0.5 IoU,但高 IoU(0.75~0.95)阶段的精修被你现在的回归目标/梯度机制“掐断”了。
下面我先把你当前 WIoU+Focaler 这版为什么掉点讲清楚,然后给你一个“下一步怎么改”的可落地思路(按推荐优先级),包含你可以直接改的代码形态。
你在 WIoU 分支里:
pythoniou_used = ((raw_iou - d) / (u - d)).clamp(0.0, 1.0) loss_iou = (1 - iou_used) * exp_term.detach()
这会带来两个致命问题:
raw_iou < d(比如 <0.10)时,iou_used=0,(1-iou_used)=1 是常数,对 raw_iou 的梯度为 0
→ 这类样本(训练早期、tiny 目标、难样本)回归项基本不提供梯度,只能靠 DFL 推动,通常就会出现:
raw_iou > u(你设 u=0.90)时,iou_used=1,loss_iou=0,梯度也为 0
→ 也就是当框已经比较准(>0.9)以后,你主动停止优化。
但 mAP50-95 里最高阈值是 0.95,你等于把 [email protected] 的提升空间直接砍掉。
这和你看到的结果非常匹配:只换 loss 时 mAP50 还涨、但 mAP50-95 崩到 0.34;在你更强的 backbone+head 上因为特征更强、DFL/分类更强,才没崩得那么夸张,但仍然掉了一截。
你现在:
pythonloss_iou = ((1 - iou_f) * exp_term.detach()) ...
detach 后,WIoU 的“距离注意力项”只作为纯权重缩放梯度大小,而不参与优化中心距离本身(梯度不回传到 rho2/c2)。
这不是绝对不行,但会让它更像“样本重加权”,尺度特别敏感,经常需要重新调 box/dfl loss gain,否则会出现“训练看似收敛但定位精度不够”。
结论先说:
你现在这个任务(小目标 + 长尾 + 高 mAP50-95 追求),最稳、最有效的路线是:回归仍用 CIoU/EIoU 做“本体”,WIoU/Focaler 只做“权重/聚焦”,而且必须避免硬截断导致梯度为 0。
你原来 CIoU 的结果已经很好(mAP50-95=0.84),所以最聪明的做法不是替换掉 CIoU,而是:
(1 - CIoU) 或 (1 - EIoU)用 sigmoid 做一个平滑的区间权重(band-pass),就不会出现梯度硬归零:
python# raw_iou: [0,1] # d 控制低 IoU 抑制起点,u 控制高 IoU 抑制起点 d, u = 0.05, 0.995 # u 千万别用 0.90 tau = 0.05 # 平滑程度(越小越像硬阈值) w_low = torch.sigmoid((raw_iou - d) / tau) # raw_iou<<d -> ~0, >d -> ~1 w_high = torch.sigmoid((u - raw_iou) / tau) # raw_iou<<u -> ~1, >u -> ~0 w = (w_low * w_high) # 给一个下限,保证再差的样本也有一点回归梯度(非常关键) w = (0.2 + 0.8 * w).detach()
然后:
pythonciou = bbox_iou(pred, target, xywh=False, CIoU=True) # 你的原版 loss_iou = (((1.0 - ciou) * w) * weight).sum() / target_scores_sum
为什么这条路线很适合你:
这条路线通常就能把你现在 WIoU+Focaler 掉的那部分 mAP50-95 拿回来,同时在长尾/噪声标签场景保持稳定。
Wise-IoU 论文明确给了 v1/v2/v3:
你现在做的 Focaler clamp,有点像“静态非单调”,但因为 clamp 导致梯度硬断,效果容易翻车。
按论文公式(你 WIoU_Scale 的思路就是这个),但不要再把 IoU 映射后 clamp 掉梯度:
核心思想:
raw_iou(或 1-raw_iou)来算 r(梯度增益系数),r 只做权重(可 detach)LIoU = 1 - raw_iou(或者直接用 WIoU_v1 = R * LIoU)一个很稳的写法是:
pythonraw_iou, exp_term = bbox_iou(..., WIoU=True, Focaler=False, scale=False) # 这里让 bbox_iou 返回 raw_iou(0~1) + exp_term LIoU = 1.0 - raw_iou # v3: beta = LIoU / mean(LIoU), r = beta / (delta * gamma^(beta - delta)) wiou_state = WIoU_Scale(LIoU) # 记得把 init 改成 __init__ r = WIoU_Scale._scaled_loss(wiou_state, gamma=1.9, delta=3) # 论文常用参数组合之一 ([arXiv](https://arxiv.org/pdf/2301.10051v1)) # 防止极端放大/缩小(非常实用) r = r.clamp(0.2, 5.0).detach() # exp_term 也建议做上界裁剪,避免权重爆炸 exp_term = exp_term.clamp(max=10.0).detach() loss_iou = ((r * LIoU * exp_term) * weight).sum() / target_scores_sum
WIoU_Scale.init 改成 __init__,否则实例化不会自动运行。你的场景是典型 tiny/小目标异物,IoU 类指标对“像素级偏移”非常敏感;NWD 的思路是把框当作 2D 高斯分布,用 Wasserstein 距离做相似度,对 tiny 目标的小偏移更平滑,可以嵌入 loss/assign/nms 等环节 (arXiv)。
这条路线的优点是:专门补你 mAP50-95 的“精定位稳定性”,而不是靠强行调 IoU 的曲线。
根据 NWD 的简化形式(中心差 + 宽高差)并用指数归一化 (ar5iv):
pythondef nwd_similarity_xyxy(box1, box2, C=32.0, eps=1e-7): # box: (N,4) xyxy x1, y1, x2, y2 = box1.unbind(-1) gx1, gy1, gx2, gy2 = box2.unbind(-1) cx = (x1 + x2) * 0.5 cy = (y1 + y2) * 0.5 gcx = (gx1 + gx2) * 0.5 gcy = (gy1 + gy2) * 0.5 w = (x2 - x1).clamp(min=eps) h = (y2 - y1).clamp(min=eps) gw = (gx2 - gx1).clamp(min=eps) gh = (gy2 - gy1).clamp(min=eps) # W2^2 = (dx^2 + dy^2) + ((dw^2 + dh^2)/4) dist2 = (cx - gcx) ** 2 + (cy - gcy) ** 2 + ((w - gw) ** 2 + (h - gh) ** 2) * 0.25 dist = torch.sqrt(dist2 + eps) # NWD = exp(-dist/C) return torch.exp(-dist / C)
然后在 BboxLoss 里加一项:
pythonnwd = nwd_similarity_xyxy(pred, target, C=C_value) loss_nwd = ((1.0 - nwd).unsqueeze(-1) * weight).sum() / target_scores_sum # 总回归损失 loss_iou = loss_ciou + 0.5 * loss_nwd # 0.3~1.0 都可以试,通常从 0.5 起步
论文里 C 和数据集目标尺寸相关(他们用“平均绝对尺寸”) (ar5iv)。
你的工程里可以用更实用的做法:
C = mean(sqrt(w*h))(统计训练集 GT 的像素尺度)C = imgsz / 10,再微调为了避免“改了很多但不知道是谁带来的”,建议你按这个顺序做 4 次实验(每次只改一处):
loss=(1-raw_iou)*exp_term 或直接回到 CIoU,看 mAP50-95 是否立刻回升bbox 回归是 class-agnostic 的(同一套回归监督),你现在类别不均衡(几百 vs 2千)更容易卡在:
这类问题很多时候改分类 loss更直接,比如 Varifocal Loss 这类“用 IoU 质量监督分类分数”的思路,会改善排序与长尾表现 (CVF Open Access)。
如果你愿意下一步继续往“loss 系统”走,我建议你把回归按路线 A 稳住后,再把分类从普通 BCE/soft label 进一步做成“VFL/QFL 类”的形式。
如果你只想做一次最可能成功的改动(不想反复试错):
✅ 直接用“路线 A:CIoU + soft gate 权重(带下限)”
如果你愿意,我也可以把你现在这套 bbox_iou() 返回值接口重新整理一下(把 raw_iou / ciou / exp_term 统一返回、避免重复计算),这样你做 ablation 会更快更干净。