I'm improving my object detection with YOLOv11; the scenario is suspended-monorail small-target foreign-object detection, and the main foreign objects are kites, dro...

Created on: January 8, 2026

Answered using GPT-5.2 Pro via Chat01

Question

I'm using YOLOv11 and improving it for my object-detection task. The scenario is small-target foreign-object detection on a suspended monorail; the main foreign objects are kites, drones, plastic sheeting, balloons, and birds. The dataset is class-imbalanced: some classes have around 2,000 images while others have only a few hundred. Before any modification, YOLOv11 gives P, R, mAP50, and mAP50-95 of 0.923, 0.828, 0.89, and 0.719 respectively. I changed my config file to the following:

```yaml
backbone:
  - [-1, 1, Conv, [64, 3, 2]]    # 0
  - [-1, 1, Conv, [128, 3, 2]]   # 1
  - [-1, 1, MSFAM, [256]]        # 2
  - [-1, 1, Conv, [256, 3, 2]]   # 3
  - [-1, 1, MSFAM, [512]]        # 4
  - [-1, 1, Conv, [512, 3, 2]]   # 5
  - [-1, 2, C3k2, [512, True]]   # 6
  - [-1, 1, Conv, [1024, 3, 2]]  # 7
  - [-1, 2, C3k2, [1024, True]]  # 8
  - [-1, 1, SPPF, [1024, 5]]     # 9
  - [-1, 2, C2PSA, [1024]]       # 10

head:
  # top-down PAN
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]  # 11
  - [[-1, 6], 1, Concat, [1]]                   # 12
  - [-1, 2, C3k2, [512, False]]                 # 13
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]  # 14
  - [[-1, 4], 1, Concat, [1]]                   # 15
  - [-1, 2, C3k2, [256, False]]                 # 16
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]  # 17
  - [[-1, 2], 1, Concat, [1]]                   # 18
  - [-1, 2, C3k2, [128, False]]                 # 19 P2_out

  # bottom-up PAN (stride-2 convs replaced with SPDConvLiteRes)
  - [-1, 1, SPDConvLiteResEMA, [256, 3, 2, 0.5]]   # 20 P2 -> P3
  - [[-1, 16], 1, Concat, [1]]                     # 21
  - [-1, 2, C3k2, [256, False]]                    # 22 P3_out
  - [-1, 1, SPDConvLiteResEMA, [512, 3, 2, 0.5]]   # 23 P3 -> P4
  - [[-1, 13], 1, Concat, [1]]                     # 24
  - [-1, 2, C3k2, [512, False]]                    # 25 P4_out
  - [-1, 1, SPDConvLiteResEMA, [1024, 3, 2, 0.5]]  # 26 P4 -> P5
  - [[-1, 10], 1, Concat, [1]]                     # 27
  - [-1, 2, C3k2, [1024, True]]                    # 28 P5_out
  - [[19, 22, 25, 28], 1, DetectLSDECD, [nc, 16, 16]]
```
The code for each module is as follows:

```python
class ConvBNAct(nn.Module):
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super().__init__()
        if p is None:
            p = (k - 1) // 2 if isinstance(k, int) else (k[0] // 2, k[1] // 2)
        self.conv = nn.Conv2d(c1, c2, k, s, p, groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act else nn.Identity()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))
```

```python
# ============================================================
# 1) MSFAM (replaces the first two C3k2 blocks in the backbone)
# Paper: MSFAM uses multi-branch convs (1x1/3x3/5x5) plus a residual
# Ref: (改进YOLOv11n的无人机航拍图像小目标检测模型_王井阳.pdf)
# ============================================================

class MSFAM(nn.Module):
    """
    MSFAM(c1, c2):
    - residual branch
    - main branch: 1x1 -> split into 3 sub-branches:
        b1: 3x3 -> 1x1
        b2: 5x5 -> 1x1
        b3: 1x1
    - concat -> 1x1 fuse -> + residual
    """
    def __init__(self, c1, c2, act=True):
        super().__init__()
        self.res = ConvBNAct(c1, c2, k=1, s=1, act=act) if c1 != c2 else nn.Identity()
        # main stem 1x1 (keeps spatial size)
        self.stem = ConvBNAct(c1, c2, k=1, s=1, act=act)
        # three sub-branches
        self.b1 = nn.Sequential(
            ConvBNAct(c2, c2, k=3, s=1, act=act),
            ConvBNAct(c2, c2, k=1, s=1, act=act),
        )
        self.b2 = nn.Sequential(
            ConvBNAct(c2, c2, k=5, s=1, act=act),
            ConvBNAct(c2, c2, k=1, s=1, act=act),
        )
        self.b3 = ConvBNAct(c2, c2, k=1, s=1, act=act)
        # concat(3*c2) -> c2
        self.fuse = ConvBNAct(3 * c2, c2, k=1, s=1, act=act)

    def forward(self, x):
        r = self.res(x)
        s = self.stem(x)
        y1 = self.b1(s)
        y2 = self.b2(s)
        y3 = self.b3(s)
        y = self.fuse(torch.cat([y1, y2, y3], dim=1))
        return y + r
```

```python
class EMA(nn.Module):
    def __init__(self, channels, c2=None, factor=32):
        super().__init__()
        self.groups = factor
        assert channels % factor == 0
        assert channels // self.groups > 0
        self.softmax = nn.Softmax(-1)
        self.agp = nn.AdaptiveAvgPool2d((1, 1))
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))
        self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups)
        self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        b, c, h, w = x.size()
        group_x = x.reshape(b * self.groups, -1, h, w)  # b*g, c//g, h, w
        x_h = self.pool_h(group_x)
        x_w = self.pool_w(group_x).permute(0, 1, 3, 2)
        hw = self.conv1x1(torch.cat([x_h, x_w], dim=2))
        x_h, x_w = torch.split(hw, [h, w], dim=2)
        x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid())
        x2 = self.conv3x3(group_x)
        x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x12 = x2.reshape(b * self.groups, c // self.groups, -1)  # b*g, c//g, hw
        x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x22 = x1.reshape(b * self.groups, c // self.groups, -1)  # b*g, c//g, hw
        weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w)
        return (group_x * weights.sigmoid()).reshape(b, c, h, w)
```

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Reuse the Conv already in your project (Ultralytics' Conv)
from ultralytics.nn.modules.conv import Conv


class SPD(nn.Module):
    def __init__(self, scale=2):
        super().__init__()
        assert scale >= 2 and int(scale) == scale
        self.scale = int(scale)

    def forward(self, x):
        b, c, h, w = x.shape
        s = self.scale
        pad_h = (s - h % s) % s
        pad_w = (s - w % s) % s
        if pad_h or pad_w:
            x = F.pad(x, (0, pad_w, 0, pad_h))
            b, c, h, w = x.shape
        x = x.view(b, c, h // s, s, w // s, s)
        x = x.permute(0, 1, 3, 5, 2, 4).contiguous()
        return x.view(b, c * s * s, h // s, w // s)
```

```python
class SPDConvLiteRes(nn.Module):
    """
    Identity-preserving SPDConvLite:
        out = skip(x) + alpha * main(SPD(x))
    alpha is initialized to 0, so the module starts out equivalent to a plain
    downsampling conv and cannot hurt the baseline early in training.
    """
    def __init__(self, c1, c2, k=3, s=2, ratio=0.5, act=True):
        super().__init__()
        self.spd = SPD(scale=s)
        cin = c1 * s * s
        hidden = max(16, int(c2 * ratio))
        # baseline path: guarantees "at least no worse than the original"
        self.skip = Conv(c1, c2, 1, s, act=act)  # 1x1, stride=s
        # SPD enhancement path
        self.pw1 = Conv(cin, hidden, 1, 1, act=act)
        self.dw = Conv(hidden, hidden, k, 1, g=hidden, act=act)
        self.pw2 = Conv(hidden, c2, 1, 1, act=act)
        # gate: zero-initialized
        self.alpha = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        y = self.spd(x)
        y = self.pw1(y)
        y = self.dw(y)
        y = self.pw2(y)
        return self.skip(x) + self.alpha * y
```

```python
class SPDConvLiteResEMA(nn.Module):
    """
    SPDConvLiteRes + EMA with a residual gate:
        out = base + alpha * spd_branch, then x = x + beta * (EMA(x) - x)
    beta is initialized to 0, so the attention is a no-op by default.
    """
    def __init__(self, c1, c2, k=3, s=2, ratio=0.5, ema_factor=32, act=True):
        super().__init__()
        self.core = SPDConvLiteRes(c1, c2, k=k, s=s, ratio=ratio, act=act)
        self.attn = EMA(c2, factor=ema_factor)  # reuses the EMA above
        self.beta = nn.Parameter(torch.zeros(1))  # attention gate (zero-init)

    def forward(self, x):
        y = self.core(x)
        a = self.attn(y)
        return y + self.beta * (a - y)
```

The detection-head code is as follows:

```python
# ultralytics/nn/modules/lsdecd.py
# -*- coding: utf-8 -*-

import math
from typing import List

import torch
import torch.nn as nn
import torch.nn.functional as F

# Key point: inherit from the official Detect so stride / loss / inference
# decoding stay compatible
from ultralytics.nn.modules.head import Detect


def _choose_gn_groups(c: int, max_groups: int = 16) -> int:
    """Pick a GroupNorm group count that divides the channels, so GroupNorm never errors."""
    g = min(max_groups, c)
    while g > 1 and (c % g) != 0:
        g -= 1
    return max(g, 1)


class ConvGN(nn.Module):
    """Conv + GroupNorm + SiLU (the Conv_GN in the paper's head)
    (YOLOv11-DEC_An_Improved_YOLOv11_Model_for_UAV_Detection_in_Complex_Contexts.pdf)"""
    def __init__(self, c1, c2, k=1, s=1, p=None, groups=1, gn_groups=16):
        super().__init__()
        if p is None:
            p = k // 2
        self.conv = nn.Conv2d(c1, c2, k, s, p, groups=groups, bias=False)
        self.gn = nn.GroupNorm(_choose_gn_groups(c2, gn_groups), c2)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.gn(self.conv(x)))
```

```python
class DEConv2d(nn.Module):
    """
    DEConv (Detail-Enhanced Convolution)
    Paper Eq. (4): DEConv(F) = Σ_i F*K_i = F*(Σ_i K_i) = F*K_cvt, so it can be
    re-parameterized into a single plain convolution.
    (YOLOv11-DEC_An_Improved_YOLOv11_Model_for_UAV_Detection_in_Complex_Contexts.pdf)

    Built here from one learnable 3x3 kernel W:
        K1: vanilla conv
        K2: CDC (≈ CPDC)
        K3: ADC (≈ APDC)
        K4: HDC
        K5: VDC
    then K_cvt = K1+K2+K3+K4+K5, applied with a single F.conv2d.
    """
    def __init__(self, c1, c2, k=3, s=1, p=None, g=1, bias=True):
        super().__init__()
        assert k == 3, "This reimplementation follows the paper's 3x3 DEConv (extend if needed)"
        if p is None:
            p = 1
        self.stride = s
        self.padding = p
        self.groups = g
        self.weight = nn.Parameter(torch.empty(c2, c1 // g, 3, 3))
        self.bias = nn.Parameter(torch.zeros(c2)) if bias else None
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        self.deploy = False
        self.reparam_conv = None  # single conv used in deploy mode

    @staticmethod
    def _cpdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """
        CPDC (central pixel difference conv) equivalent kernel:
        y = sum_{i != 5} w_i (x_i - x_5), i.e. a plain conv whose center weight
        is -sum(neighbors).
        """
        wc = w.clone()
        # neighbors sum (exclude center [1, 1])
        nsum = (w[:, :, 0, 0] + w[:, :, 0, 1] + w[:, :, 0, 2]
                + w[:, :, 1, 0] + w[:, :, 1, 2]
                + w[:, :, 2, 0] + w[:, :, 2, 1] + w[:, :, 2, 2])
        wc[:, :, 1, 1] = -nsum
        return wc

    @staticmethod
    def _apdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """
        APDC (angular pixel difference conv) equivalent 3x3 kernel, center 0,
        using the commonly derived equivalent coefficients (see the PDC
        supplementary material), with w1..w9 laid out as:
            w1 w2 w3
            w4 w5 w6
            w7 w8 w9
        """
        wa = torch.zeros_like(w)
        w1, w2, w3 = w[:, :, 0, 0], w[:, :, 0, 1], w[:, :, 0, 2]
        w4, w5, w6 = w[:, :, 1, 0], w[:, :, 1, 1], w[:, :, 1, 2]
        w7, w8, w9 = w[:, :, 2, 0], w[:, :, 2, 1], w[:, :, 2, 2]
        wa[:, :, 0, 0] = w1 - w4
        wa[:, :, 0, 1] = w2 - w1
        wa[:, :, 0, 2] = w3 - w2
        wa[:, :, 1, 0] = w4 - w7
        wa[:, :, 1, 1] = 0.0
        wa[:, :, 1, 2] = w6 - w3
        wa[:, :, 2, 0] = w7 - w8
        wa[:, :, 2, 1] = w8 - w9
        wa[:, :, 2, 2] = w9 - w6
        return wa

    @staticmethod
    def _hdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """
        HDC (horizontal difference conv) equivalent kernel,
        typical form: [[w1, 0, -w1], [w4, 0, -w4], [w7, 0, -w7]]
        """
        wh = torch.zeros_like(w)
        wh[:, :, 0, 0] = w[:, :, 0, 0]
        wh[:, :, 1, 0] = w[:, :, 1, 0]
        wh[:, :, 2, 0] = w[:, :, 2, 0]
        wh[:, :, 0, 2] = -w[:, :, 0, 0]
        wh[:, :, 1, 2] = -w[:, :, 1, 0]
        wh[:, :, 2, 2] = -w[:, :, 2, 0]
        return wh

    @staticmethod
    def _vdc_kernel(w: torch.Tensor) -> torch.Tensor:
        """
        VDC (vertical difference conv) equivalent kernel,
        typical form: [[w1, w2, w3], [0, 0, 0], [-w1, -w2, -w3]]
        """
        wv = torch.zeros_like(w)
        wv[:, :, 0, 0] = w[:, :, 0, 0]
        wv[:, :, 0, 1] = w[:, :, 0, 1]
        wv[:, :, 0, 2] = w[:, :, 0, 2]
        wv[:, :, 2, 0] = -w[:, :, 0, 0]
        wv[:, :, 2, 1] = -w[:, :, 0, 1]
        wv[:, :, 2, 2] = -w[:, :, 0, 2]
        return wv

    def get_equivalent_kernel_bias(self):
        w = self.weight
        k1 = w
        k2 = self._cpdc_kernel(w)  # CDC ≈ CPDC
        k3 = self._apdc_kernel(w)  # ADC ≈ APDC
        k4 = self._hdc_kernel(w)   # HDC
        k5 = self._vdc_kernel(w)   # VDC
        k = k1 + k2 + k3 + k4 + k5
        b = self.bias
        return k, b

    def switch_to_deploy(self):
        """Merge DEConv into a single nn.Conv2d (faster inference; matches the
        paper's re-parameterization).
        (YOLOv11-DEC_An_Improved_YOLOv11_Model_for_UAV_Detection_in_Complex_Contexts.pdf)"""
        if self.deploy:
            return
        k, b = self.get_equivalent_kernel_bias()
        conv = nn.Conv2d(
            in_channels=k.shape[1] * self.groups,
            out_channels=k.shape[0],
            kernel_size=3,
            stride=self.stride,
            padding=self.padding,
            groups=self.groups,
            bias=(b is not None),
        )
        conv.weight.data.copy_(k)
        if b is not None:
            conv.bias.data.copy_(b)
        self.reparam_conv = conv
        self.deploy = True
        # drop the original parameter references
        del self.weight
        if self.bias is not None:
            del self.bias

    def forward(self, x):
        if self.deploy and self.reparam_conv is not None:
            return self.reparam_conv(x)
        k, b = self.get_equivalent_kernel_bias()
        return F.conv2d(x, k, b, stride=self.stride, padding=self.padding, groups=self.groups)
```

```python
class DEConvGN(nn.Module):
    """DEConv + GroupNorm + SiLU (the DEConv_GN in the paper's head)
    (YOLOv11-DEC_An_Improved_YOLOv11_Model_for_UAV_Detection_in_Complex_Contexts.pdf)"""
    def __init__(self, c1, c2, k=3, s=1, p=1, g=1, gn_groups=16):
        super().__init__()
        self.de = DEConv2d(c1, c2, k=k, s=s, p=p, g=g, bias=False)
        self.gn = nn.GroupNorm(_choose_gn_groups(c2, gn_groups), c2)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.gn(self.de(x)))

    def switch_to_deploy(self):
        self.de.switch_to_deploy()
```

```python
class Scale(nn.Module):
    """One learnable scale per level (common in decoupled-head regression branches)."""
    def __init__(self, init=1.0):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(float(init)))

    def forward(self, x):
        return x * self.scale
```

```python
class DetectLSDECD(Detect):
    """
    Reimplementation of the LSDECD head from YOLOv11-DEC:
    - Conv_GN / DEConv_GN structure (see the paper's Fig. 3)
    - DEConv re-parameterizable into a plain conv (paper Eq. (4))
    - uses GN (which the paper motivates for robustness in complex scenes)
    (YOLOv11-DEC_An_Improved_YOLOv11_Model_for_UAV_Detection_in_Complex_Contexts.pdf)

    YAML args (excluding ch): [nc, reg_max=16, gn_groups=16]
    """
    def __init__(self, nc=80, reg_max=16, gn_groups=16, ch=()):
        # let the parent Detect set up its required fields (nl/no/dfl, etc.)
        try:
            super().__init__(nc=nc, ch=ch, reg_max=reg_max)
        except TypeError:
            super().__init__(nc=nc, ch=ch)
        self.reg_max = int(reg_max)
        self.nc = int(nc)
        self.nl = len(ch)
        self.gn_groups = int(gn_groups)
        # regression output channels (DFL); most Detect versions use reg_out = 4*reg_max
        self.reg_out = 4 * int(self.reg_max)
        self.no = self.nc + self.reg_out
        # decoupled head: separate classification / regression branches (paper Fig. 3)
        c_ = max(16, ch[0] // 4, self.reg_out) if len(ch) else 64
        self.stem = nn.ModuleList([ConvGN(c, c_, k=1, gn_groups=self.gn_groups) for c in ch])
        # "shared" part: two DEConvGN layers shared across all scales
        self.shared_de1 = DEConvGN(c_, c_, k=3, gn_groups=self.gn_groups)
        self.shared_de2 = DEConvGN(c_, c_, k=3, gn_groups=self.gn_groups)
        # prediction layers: one per scale (so bias_init can use the per-scale stride)
        self.reg_pred = nn.ModuleList([nn.Conv2d(c_, self.reg_out, 1) for _ in ch])
        self.cls_pred = nn.ModuleList([nn.Conv2d(c_, self.nc, 1) for _ in ch])
        self.scales = nn.ModuleList([Scale(1.0) for _ in ch])
        # the stride attribute must exist (the loss reads it)
        if not hasattr(self, "stride"):
            self.stride = torch.zeros(self.nl)

    def forward(self, x: List[torch.Tensor]):
        # x: list of feature maps from the neck
        outs = []
        for i in range(self.nl):
            f = self.stem[i](x[i])
            f = self.shared_de2(self.shared_de1(f))
            reg = self.scales[i](self.reg_pred[i](f))
            cls = self.cls_pred[i](f)
            outs.append(torch.cat((reg, cls), 1))
        if self.training:
            return outs
        # reuse Detect's inference decoding (method name differs across versions)
        if hasattr(self, "_inference"):
            y = self._inference(outs)
            return y if getattr(self, "export", False) else (y, outs)
        if hasattr(self, "inference"):
            y = self.inference(outs)
            return y if getattr(self, "export", False) else (y, outs)
        return outs

    def switch_to_deploy(self):
        """Switch the shared DEConvGN layers to deploy mode (merged into plain convs)."""
        self.shared_de1.switch_to_deploy()
        self.shared_de2.switch_to_deploy()
```

The code above works very well (0.982, 0.923, 0.968, 0.84), using plain CIoU. Now I need you to tell me how to improve the loss. I'm using the loss below:

```python
class WIoU_Scale:
    '''monotonous: {
        None: origin v1
        True: monotonic FM v2
        False: non-monotonic FM v3
    }
    momentum: the momentum of the running mean'''

    iou_mean = 1.
    monotonous = False
    _momentum = 1 - 0.5 ** (1 / 7000)
    _is_train = True

    def __init__(self, iou):
        self.iou = iou
        self._update(self)

    @classmethod
    def _update(cls, self):
        if cls._is_train:
            cls.iou_mean = (1 - cls._momentum) * cls.iou_mean + \
                           cls._momentum * self.iou.detach().mean().item()

    @classmethod
    def _scaled_loss(cls, self, gamma=1.9, delta=3):
        if isinstance(self.monotonous, bool):
            if self.monotonous:
                return (self.iou.detach() / self.iou_mean).sqrt()
            else:
                beta = self.iou.detach() / self.iou_mean
                alpha = delta * torch.pow(gamma, beta - delta)
                return beta / alpha
        return 1
```
```python
def bbox_iou(box1, box2, xywh=True,
             GIoU=False, DIoU=False, CIoU=False, SIoU=False, EIoU=False,
             WIoU=False, Focal=False,
             # new: Focaler-IoU mapping
             Focaler=False, d=0.00, u=0.95,
             alpha=1, gamma=0.5, scale=False, eps=1e-7):
    """Returns Intersection over Union (IoU) of box1(1,4) to box2(n,4)."""
    # Get the coordinates of bounding boxes
    if xywh:  # transform from xywh to xyxy
        (x1, y1, w1, h1), (x2, y2, w2, h2) = box1.chunk(4, -1), box2.chunk(4, -1)
        w1_, h1_, w2_, h2_ = w1 / 2, h1 / 2, w2 / 2, h2 / 2
        b1_x1, b1_x2, b1_y1, b1_y2 = x1 - w1_, x1 + w1_, y1 - h1_, y1 + h1_
        b2_x1, b2_x2, b2_y1, b2_y2 = x2 - w2_, x2 + w2_, y2 - h2_, y2 + h2_
    else:  # x1, y1, x2, y2
        b1_x1, b1_y1, b1_x2, b1_y2 = box1.chunk(4, -1)
        b2_x1, b2_y1, b2_x2, b2_y2 = box2.chunk(4, -1)
        w1, h1 = b1_x2 - b1_x1, (b1_y2 - b1_y1).clamp(eps)
        w2, h2 = b2_x2 - b2_x1, (b2_y2 - b2_y1).clamp(eps)

    # Intersection area
    inter = (b1_x2.minimum(b2_x2) - b1_x1.maximum(b2_x1)).clamp(0) * \
            (b1_y2.minimum(b2_y2) - b1_y1.maximum(b2_y1)).clamp(0)

    # Union area
    union = w1 * h1 + w2 * h2 - inter + eps

    # raw IoU in [0, 1]; the Focaler mapping must use this
    raw_iou = inter / (union + eps)

    # the original alpha-IoU is kept
    iou = torch.pow(raw_iou, alpha)

    if scale:
        # use raw_iou here; more stable
        self = WIoU_Scale(1 - raw_iou)

    if CIoU or DIoU or GIoU or EIoU or SIoU or WIoU:
        cw = b1_x2.maximum(b2_x2) - b1_x1.minimum(b2_x1)  # convex width
        ch = b1_y2.maximum(b2_y2) - b1_y1.minimum(b2_y1)  # convex height
        if CIoU or DIoU or EIoU or SIoU or WIoU:
            c2 = (cw ** 2 + ch ** 2) ** alpha + eps
            rho2 = (((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
                     (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4) ** alpha
            if CIoU:
                v = (4 / math.pi ** 2) * (torch.atan(w2 / h2) - torch.atan(w1 / h1)).pow(2)
                with torch.no_grad():
                    alpha_ciou = v / (v - iou + (1 + eps))
                if Focal:
                    return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha)), torch.pow(raw_iou, gamma)
                return iou - (rho2 / c2 + torch.pow(v * alpha_ciou + eps, alpha))
            elif EIoU:
                rho_w2 = ((b2_x2 - b2_x1) - (b1_x2 - b1_x1)) ** 2
                rho_h2 = ((b2_y2 - b2_y1) - (b1_y2 - b1_y1)) ** 2
                cw2 = torch.pow(cw ** 2 + eps, alpha)
                ch2 = torch.pow(ch ** 2 + eps, alpha)
                if Focal:
                    return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2), torch.pow(raw_iou, gamma)
                return iou - (rho2 / c2 + rho_w2 / cw2 + rho_h2 / ch2)
            elif SIoU:
                s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps
                s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps
                sigma = torch.pow(s_cw ** 2 + s_ch ** 2, 0.5)
                sin_alpha_1 = torch.abs(s_cw) / sigma
                sin_alpha_2 = torch.abs(s_ch) / sigma
                threshold = pow(2, 0.5) / 2
                sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1)
                angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2)
                rho_x = (s_cw / cw) ** 2
                rho_y = (s_ch / ch) ** 2
                gamma_siou = angle_cost - 2
                distance_cost = 2 - torch.exp(gamma_siou * rho_x) - torch.exp(gamma_siou * rho_y)
                omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2)
                omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2)
                shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow(1 - torch.exp(-1 * omiga_h), 4)
                if Focal:
                    return iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha), torch.pow(raw_iou, gamma)
                return iou - torch.pow(0.5 * (distance_cost + shape_cost) + eps, alpha)
            elif WIoU:
                if Focal:
                    raise RuntimeError("WIoU does not support Focal.")
                exp_term = torch.exp(rho2 / c2)  # WIoU v1 weighting term
                # Focaler mapping: applied only to raw_iou in [0, 1]
                if Focaler:
                    if u <= d + 1e-12:
                        raise ValueError(f"Invalid Focaler interval: u({u}) must be > d({d})")
                    iou_used = ((raw_iou - d) / (u - d)).clamp(0.0, 1.0)
                else:
                    iou_used = iou
                if scale:
                    return getattr(WIoU_Scale, '_scaled_loss')(self), (1 - iou_used) * exp_term, iou_used
                return iou_used, exp_term  # returns the IoU term plus the weighting term
            if Focal:
                return iou - rho2 / c2, torch.pow(raw_iou, gamma)  # DIoU
            return iou - rho2 / c2  # DIoU
        c_area = cw * ch + eps  # convex area
        if Focal:
            return iou - torch.pow((c_area - union) / c_area + eps, alpha), torch.pow(raw_iou, gamma)  # GIoU
        return iou - torch.pow((c_area - union) / c_area + eps, alpha)  # GIoU
    if Focal:
        return iou, torch.pow(raw_iou, gamma)
    return iou
```
The loss:

```python
class BboxLoss(nn.Module):
    """Criterion class for computing training losses for bounding boxes."""

    def __init__(self, reg_max=16):
        """Initialize the BboxLoss module with regularization maximum and DFL settings."""
        super().__init__()
        self.dfl_loss = DFLoss(reg_max) if reg_max > 1 else None

    def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
        """Compute IoU and DFL losses for bounding boxes."""
        weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)

        # original:
        # iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True)
        # loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum

        # 20121224 change: WIoU (v1) + Focaler-IoU mapping
        iou_f, exp_term = bbox_iou(
            pred_bboxes[fg_mask],
            target_bboxes[fg_mask],
            xywh=False,
            WIoU=True,
            Focaler=True,
            d=0.10, u=0.90,  # suggested starting interval 0.10~0.90 (more stable for imbalanced / small-object data)
            scale=False
        )

        # exp_term is detached so the exp weight doesn't amplify gradients and destabilize training
        loss_iou = (((1.0 - iou_f) * exp_term.detach()) * weight).sum() / target_scores_sum

        # other variants tried:
        # iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True, scale=True)
        # iou = inner_eiou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=True)
        # loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
        # if type(iou) is tuple:
        #     if len(iou) == 2:
        #         loss_iou = ((1.0 - iou[0]) * iou[1].detach() * weight).sum() / target_scores_sum
        #     else:
        #         loss_iou = (iou[0] * iou[1] * weight).sum() / target_scores_sum
        # else:
        #     loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum

        # DFL loss
        if self.dfl_loss:
            target_ltrb = bbox2dist(anchor_points, target_bboxes, self.dfl_loss.reg_max - 1)
            loss_dfl = self.dfl_loss(pred_dist[fg_mask].view(-1, self.dfl_loss.reg_max), target_ltrb[fg_mask]) * weight
            loss_dfl = loss_dfl.sum() / target_scores_sum
        else:
            loss_dfl = torch.tensor(0.0).to(pred_dist.device)

        return loss_iou, loss_dfl
```
With this loss, the results are much worse: 0.958, 0.917, 0.941, 0.818. Adding only this loss while keeping every other module as stock YOLOv11 gives 0.973, 0.875, 0.927, 0.34, so there is still some gain. How should I improve this loss next? Please give a suitable approach.

Thinking

Adjusting focus

I'm enhancing YOLOv11 with MSFAM, SPDConvLiteResEMA, and DetectLSDECD to boost P, R, and mAP50, while addressing dataset imbalance and class-balanced classification loss.

Taking a closer look

Removing the hard clamp and using a smooth mapping for IoU will maintain gradient flow, ensuring effective learning and improving high-threshold accuracy.

Advancing loss functions

I’m exploring scalable IoU loss functions and class-balanced losses. The goal is to address early training issues and improve high-threshold accuracy, ensuring effective localization.

Integrating loss strategies

I’m combining CIoU with hybrid losses, including SIoU, EIoU, and normalized Wasserstein distance, to enhance gradient flow for small object detection and address early training issues.

Enhancing IoU awareness

I’m integrating focal weighting to prioritize high IoU regions, ensuring better high-threshold accuracy, and addressing dataset imbalance with class-specific weighting for balanced classification.

Refining loss adjustments

I’m refining WIoU loss by using smooth mappings and adjusting gradient scales. Also, integrating new IoU loss combinations and weighting strategies to enhance model performance.

Enhancing IoU loss

I’m refining IoU loss with smooth mappings, focal weighting, and NWD for small objects. Additionally, addressing class imbalance with focal loss, class weights, and sophisticated reweighting techniques.


Revisiting mapping

I'm noting how IoU mapping might stall gradient flow at high thresholds, affecting mAP50-95. Adjusting mapping levels or using focal weighting might be necessary for better precision.

Gathering insights

I’m researching WIoU loss versions, Wise-IoU, Normalized Wasserstein Distance for tiny objects, and Varifocal loss. These will inform the technical analysis and final recommendations.

Highlighting WIoU integration

I’m weaving in the incorporation of the official Wise-IoU v3 focusing method to better handle small objects and imbalanced classes in the dataset.

Enhancing object detection

I’m exploring the use of NWD alongside IoU for small object detection, considering hybrid loss functions and class rebalancing strategies to boost performance.

Blending focuses

I’m merging NWD and class rebalancing strategies to refine bbox loss, testing various configurations to ensure balanced, robust detection. The aim is to enhance performance while maintaining stability.

Tweaking Focaler mapping

I’m considering adjusting the Focaler mapping to ensure the gradient is managed well, especially for IoU values beyond 0.9. This approach aims to boost the AP95 score.

Adjusting hyperparameters

I’m examining the impact of different loss functions on the architecture, focusing on tuning IoU loss and adjusting hyperparameters like box, cls, and DFL to balance training effectively.

Overhauling code structure

Revamping the WIoU_Scale class to ensure proper initialization and scaling. This involves managing hyperparameters and rethinking the loss function, aiming for balanced training.

Setting the stage

Crafting NWD calculations with a normalization constant C. Ensuring smooth function implementation to enhance tiny object detection. Progressing through details for precise outcomes.

Breaking down the issues

Addressing WIoU+Focaler mapping failures, I'm mingling AI insights with physics for a sharper focal plane analysis, aiming for clearer visuals.

Optimizing hyperparameters

I'm fine-tuning the process to boost per-class AP75 by integrating WIoU v1, WIoU v3, NWD for tiny boxes, and class weights for better imbalance handling.

Translating the formula

I am thinking through how to translate the NWD formula into code with a constant C as the average object size, considering various approaches to compute "average absolute size" for the dataset.

Evaluating options

I'm assessing the use of WIoU v3 with dynamic focusing and momentum formulas, noting v1's limitations, and proposing adjustments to the WIoU_Scale object for scaling precision.

Evaluating gradient strategies

I’m analyzing the impact of detaching exp_term in WIoU loss calculations to manage instability while maintaining gradient contributions for distance weighting, and I’m mindful of weight scaling concerns.

Optimizing small object detection

I'm refining the WIoU loss function by proposing the removal of Focaler mapping, suggesting NWD for small objects, and recommending StableNet integration for more accurate results.

Refining loss function

I’m detailing a 3-level plan to enhance the WIoU loss function, emphasizing immediate fixes, dynamic focusing with WIoU v3, and additional supervision for tiny objects.

Pinpointing the issue

By capping IoU at 0.90, training falters above this threshold, halting AP improvements. This fix aligns training with evaluation goals.

Reasoned for 18m 51s

Answer

What you're seeing is actually very typical: mAP50 holds up or even rises while mAP50-95 clearly drops. That means regression can still reach 0.5 IoU for coarse localization, but the fine refinement at high IoU (0.75~0.95) is being choked off by your current regression target / gradient mechanism.

Below I'll first explain why your current WIoU+Focaler version loses accuracy, then lay out an actionable plan for the next change (in recommended priority order), including code shapes you can apply directly.


1) The key reasons this WIoU+Focaler formulation loses accuracy

(A) You hard-clamp the IoU, so a large fraction of samples get zero regression gradient

In your WIoU branch:

```python
iou_used = ((raw_iou - d) / (u - d)).clamp(0.0, 1.0)
loss_iou = (1 - iou_used) * exp_term.detach()
```

This causes two fatal problems:

  1. When raw_iou < d (e.g. < 0.10), iou_used = 0, so (1 - iou_used) = 1 is a constant and the gradient with respect to raw_iou is 0.
    → Such samples (early training, tiny objects, hard samples) get essentially no gradient from the regression term and can only be pushed by DFL, which typically yields exactly this pattern:

    • IoU reaches 0.5~0.7 (mAP50 looks fine)
    • but refining toward 0.85~0.95 is very hard (mAP50-95 drops)
  2. When raw_iou > u (you set u = 0.90), iou_used = 1, loss_iou = 0, and the gradient is also 0.
    → In other words, once a box is already fairly accurate (> 0.9), you actively stop optimizing it.
    But the highest threshold in mAP50-95 is 0.95, so you've cut off the headroom for [email protected] entirely.

This matches your observations closely: with only the loss swapped in, mAP50 still rises but mAP50-95 collapses to 0.34; on your stronger backbone + head, the features, DFL, and classification are strong enough that it doesn't collapse as badly, but it still drops noticeably.
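The gradient-killing effect of the clamp is easy to verify with a tiny autograd check (a minimal sketch; `focaler_term` is an illustrative name, and d, u follow the mapping quoted above):

```python
import torch

# Reproduce the hard-clamped Focaler mapping and inspect its gradient.
d, u = 0.10, 0.90

def focaler_term(raw_iou):
    # the (1 - iou_used) regression term with the hard clamp
    iou_used = ((raw_iou - d) / (u - d)).clamp(0.0, 1.0)
    return 1.0 - iou_used

for v in (0.05, 0.50, 0.95):
    x = torch.tensor(v, requires_grad=True)
    focaler_term(x).backward()
    # inside (d, u) the slope is a constant -1/(u - d); outside it is exactly 0
    print(f"raw_iou={v:.2f}  d(loss)/d(raw_iou)={x.grad.item():+.3f}")
```

Samples below d and above u print a gradient of exactly zero, which is precisely the "gradient cut off at both ends" failure described above.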


(B) You detach exp_term, so what remains is essentially a weighted IoU, and the weight shifts the loss scale

You currently have:

```python
loss_iou = ((1 - iou_f) * exp_term.detach()) ...
```

After the detach, WIoU's distance-attention term acts only as a weight that rescales gradient magnitude; it no longer optimizes the center distance itself (no gradient flows back through rho2/c2).
That is not inherently wrong, but it turns the loss into something closer to sample reweighting, which is very scale-sensitive: you often need to retune the box/dfl loss gains, or training appears to converge while localization precision falls short.
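The effect of the detach can be seen in a two-line autograd check (an illustrative sketch with made-up values, not the training code):

```python
import torch

# With exp_term detached, the gradient direction comes only from (1 - iou);
# exp_term merely rescales its magnitude per sample.
iou = torch.tensor([0.4, 0.4], requires_grad=True)
exp_term = torch.tensor([1.0, 3.0])  # pretend distance-attention weights

loss = ((1.0 - iou) * exp_term.detach()).sum()
loss.backward()
print(iou.grad)  # equals -exp_term: same direction, scaled magnitude
```

Both elements have identical IoU, yet one receives a 3x larger gradient, which is exactly why the loss gains usually need retuning after this change.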


2) What to change next: three routes (in my recommended order)

Conclusion first:
For your task (small objects + long-tail data + chasing high mAP50-95), the most stable and effective route is: keep CIoU/EIoU as the regression "body" and use WIoU/Focaler only for weighting/focusing, and you must avoid any hard clamp that zeroes the gradient.


Route A (strongly recommended): keep CIoU/EIoU as the regression body and use Focaler/WIoU only as a weight; don't touch the IoU term itself

Your original CIoU results are already very good (mAP50-95 = 0.84), so the smartest move is not to replace CIoU but to:

  • keep the regression loss as (1 - CIoU) or (1 - EIoU)
  • implement the "suppress outliers / emphasize medium-quality samples" behavior you want as a weight multiplied onto it (the weight can be detached to avoid introducing odd gradients)
  • make the weight continuous and non-zero (give it at least a floor), so the gradient is never cut off

✅ Recommended weight: a soft gate instead of clamp

Use a sigmoid to build a smooth band-pass weight over IoU, so the gradient never hard-zeroes:

```python
# raw_iou: in [0, 1]
# d sets where low-IoU suppression starts, u where high-IoU suppression starts
d, u = 0.05, 0.995   # whatever you do, don't use u = 0.90
tau = 0.05           # smoothness (smaller -> closer to a hard threshold)
w_low = torch.sigmoid((raw_iou - d) / tau)   # raw_iou << d -> ~0, > d -> ~1
w_high = torch.sigmoid((u - raw_iou) / tau)  # raw_iou << u -> ~1, > u -> ~0
w = w_low * w_high
# floor the weight so even the worst samples keep some regression gradient (crucial)
w = (0.2 + 0.8 * w).detach()
```

Then:

```python
ciou = bbox_iou(pred, target, xywh=False, CIoU=True)  # your original call
loss_iou = (((1.0 - ciou) * w) * weight).sum() / target_scores_sum
```

Why this route fits your case:

  • it doesn't break the CIoU convergence path you have already validated as strong
  • the weight w damps the competition from extremely poor / extremely good samples, in the spirit of WIoU v3's "smarter gradient allocation" (arXiv)
  • crucially, the regression gradient never goes to zero when raw_iou < d, and refinement never stops above 0.9

This route alone usually recovers the mAP50-95 you lost with WIoU+Focaler, while staying stable under long-tail / noisy-label conditions.


Route B (if you insist on WIoU): use WIoU v3's dynamic non-monotonic FM, but never the hard-clamped iou_used

The Wise-IoU paper explicitly defines v1/v2/v3:

  • v1: WIoU with a distance-attention term
  • v2: monotonic FM (focal-like), with a running mean for normalization
  • v3: dynamic non-monotonic FM that uses the outlier degree β to give ordinary-quality samples larger gradients while suppressing both very high- and very low-quality ones (arXiv)

Your Focaler clamp is something like a static non-monotonic FM, but because the clamp hard-cuts the gradient, it backfires easily.

✅ Rewrite your implementation as WIoU v3's "r * WIoU_v1", keeping a continuous gradient through raw_iou

Follow the paper's formulas (your WIoU_Scale already encodes this idea), but stop clamping the mapped IoU and killing the gradient:

Core idea:

  • compute the gradient-gain coefficient r from raw_iou (or 1 - raw_iou); r acts purely as a weight (it can be detached)
  • the actual regression term stays continuous: LIoU = 1 - raw_iou (or directly WIoU_v1 = R * LIoU)

A very stable way to write it:

```python
raw_iou, exp_term = bbox_iou(..., WIoU=True, Focaler=False, scale=False)
# have bbox_iou return raw_iou (0~1) plus exp_term here
LIoU = 1.0 - raw_iou

# v3: beta = LIoU / mean(LIoU), r = beta / (delta * gamma^(beta - delta))
wiou_state = WIoU_Scale(LIoU)  # make sure the constructor is really __init__
r = WIoU_Scale._scaled_loss(wiou_state, gamma=1.9, delta=3)  # a common parameter pair from the paper (arXiv)

# clip extreme scaling (very practical)
r = r.clamp(0.2, 5.0).detach()
# cap exp_term too, so the weight can't explode
exp_term = exp_term.clamp(max=10.0).detach()

loss_iou = ((r * LIoU * exp_term) * weight).sum() / target_scores_sum
```

Two small changes you must make

  1. Spell WIoU_Scale's constructor as __init__ (it appears as init in your pasted code); otherwise it is never called on instantiation and the running mean never updates
  2. Don't use u = 0.90: if you really must suppress very high IoU, use u ≥ 0.99, or simply drop u and let v3's r handle the dynamic allocation
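For reference, a minimal corrected running-mean state with both changes applied might look like this (a sketch, not the official Wise-IoU code; `WIoUScaleFixed` and the method name `r` are illustrative, and gamma=1.9, delta=3 follow the snippet above):

```python
import torch

class WIoUScaleFixed:
    """WIoU v3-style running-mean state with a properly spelled __init__."""
    iou_mean = 1.0                     # running mean of LIoU = 1 - raw_iou
    _momentum = 1 - 0.5 ** (1 / 7000)

    def __init__(self, liou):          # `def init(...)` would never be called
        self.liou = liou
        cls = WIoUScaleFixed
        cls.iou_mean = (1 - cls._momentum) * cls.iou_mean \
            + cls._momentum * liou.detach().mean().item()

    def r(self, gamma=1.9, delta=3.0):
        """Non-monotonic focusing coefficient r = beta / (delta * gamma**(beta - delta))."""
        beta = self.liou.detach() / WIoUScaleFixed.iou_mean
        return beta / (delta * torch.pow(torch.tensor(gamma), beta - delta))
```

Usage would be `state = WIoUScaleFixed(1.0 - raw_iou)` once per batch, then `state.r().clamp(0.2, 5.0)` as the detached per-sample weight.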

Route C (very well suited to tiny objects): add an NWD branch as auxiliary regression alongside CIoU/DFL

Your scenario is classic tiny/small foreign-object detection, where IoU-style metrics are extremely sensitive to pixel-level shifts. NWD instead models each box as a 2D Gaussian and measures similarity with a Wasserstein distance, which is much smoother under the small shifts of tiny objects and can be plugged into the loss, assignment, NMS, and so on (arXiv).

The advantage of this route is that it directly shores up the fine-localization stability behind mAP50-95, rather than forcibly reshaping the IoU curve.

A practical axis-aligned NWD implementation (xyxy)

Based on NWD's simplified form (center difference + width/height difference) with exponential normalization (ar5iv):

```python
def nwd_similarity_xyxy(box1, box2, C=32.0, eps=1e-7):
    # box: (N, 4) xyxy
    x1, y1, x2, y2 = box1.unbind(-1)
    gx1, gy1, gx2, gy2 = box2.unbind(-1)
    cx = (x1 + x2) * 0.5
    cy = (y1 + y2) * 0.5
    gcx = (gx1 + gx2) * 0.5
    gcy = (gy1 + gy2) * 0.5
    w = (x2 - x1).clamp(min=eps)
    h = (y2 - y1).clamp(min=eps)
    gw = (gx2 - gx1).clamp(min=eps)
    gh = (gy2 - gy1).clamp(min=eps)
    # W2^2 = (dx^2 + dy^2) + ((dw^2 + dh^2) / 4)
    dist2 = (cx - gcx) ** 2 + (cy - gcy) ** 2 + ((w - gw) ** 2 + (h - gh) ** 2) * 0.25
    dist = torch.sqrt(dist2 + eps)
    # NWD = exp(-dist / C)
    return torch.exp(-dist / C)
```

Then add one term inside BboxLoss:

```python
nwd = nwd_similarity_xyxy(pred, target, C=C_value)
loss_nwd = ((1.0 - nwd).unsqueeze(-1) * weight).sum() / target_scores_sum
# total regression loss
loss_iou = loss_ciou + 0.5 * loss_nwd  # anything in 0.3~1.0 is worth trying; 0.5 is a good start
```

How to choose C (crucial)

In the paper, C is tied to the dataset's object sizes (they use the "average absolute size") (ar5iv).
In your project a more practical recipe works:

  • C = mean(sqrt(w*h)) over the training-set GT boxes (in pixels)
  • or crudely start with C = imgsz / 10 and fine-tune
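A small helper along those lines, assuming YOLO-format label files (`class cx cy w h`, normalized to [0, 1]); `estimate_nwd_C` and its arguments are hypothetical names, not part of Ultralytics:

```python
import math
from pathlib import Path

def estimate_nwd_C(label_dir, imgsz=640):
    """Estimate NWD's C as mean(sqrt(w*h)) over all GT boxes, in pixels."""
    sizes = []
    for f in Path(label_dir).glob("*.txt"):
        for line in f.read_text().splitlines():
            parts = line.split()
            if len(parts) >= 5:  # class cx cy w h
                w = float(parts[3]) * imgsz
                h = float(parts[4]) * imgsz
                sizes.append(math.sqrt(w * h))
    return sum(sizes) / max(len(sizes), 1)
```

Run it once over your training labels and hard-code the resulting C, so the loss stays deterministic across runs.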

3) The ablations you should run right now (no extra data needed; you can start immediately)

To avoid "changed many things, but don't know which one helped", run these 4 experiments in this order (one change each):

  1. Remove the Focaler clamp: use loss = (1 - raw_iou) * exp_term, or fall straight back to CIoU, and check whether mAP50-95 recovers immediately
  2. Raise u from 0.90 to 0.99/1.0: check whether [email protected]~0.95 recovers (it usually does)
  3. Replace the clamp with the sigmoid soft gate plus a 0.2 floor: the change most likely to stably lift mAP50-95
  4. Stack NWD on top of CIoU (start with a 0.5 coefficient): aimed specifically at localization quality under high IoU thresholds (arXiv)

4) One more reminder: your classes are imbalanced, and no amount of bbox-loss surgery will rescue the classification long tail

Box regression is class-agnostic (one shared regression supervision). With your class imbalance (a few hundred vs. about 2,000 images per class), you are more likely to get stuck on:

  • low recall on the few-shot classes
  • or poor score ranking (boxes squeezed out before/after NMS)

Problems like these are usually addressed more directly by changing the classification loss: for example, Varifocal Loss-style approaches that supervise the classification score with IoU quality improve both ranking and long-tail behavior (CVF Open Access).
If you want to keep working on the loss system, I'd first stabilize regression with Route A, then upgrade classification from plain BCE / soft labels to a VFL/QFL-style form.
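As a concrete starting point, a Varifocal-style classification loss can be sketched as follows (my sketch of the published VFL formulation, not Ultralytics' exact VarifocalLoss; the shapes and defaults are assumptions):

```python
import torch
import torch.nn.functional as F

def varifocal_loss(pred_logits, gt_score, label, alpha=0.75, gamma=2.0):
    """VFL-style loss: positives are weighted by their IoU-quality target gt_score,
    negatives are down-weighted by sigmoid(pred)**gamma. All inputs: (N, nc)."""
    pred = pred_logits.sigmoid()
    weight = alpha * pred.pow(gamma) * (1.0 - label) + gt_score * label
    bce = F.binary_cross_entropy_with_logits(pred_logits, gt_score, reduction="none")
    return (bce * weight).sum()
```

Here gt_score is the soft target (e.g. the predicted box's IoU with its assigned GT) for positive anchors and 0 elsewhere, and label is the binary positive mask; that is what makes the classification score IoU-aware and improves ranking.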


The most reliable next step

If you only want to make the single change most likely to succeed (no repeated trial and error):

Use Route A directly: CIoU + a soft-gate weight with a floor

  • it doesn't touch your already-validated, strongest CIoU main path
  • it fixes the gradient starvation caused by your current Focaler clamp
  • it is the friendliest option for mAP50-95 and carries the least risk

If you'd like, I can also clean up your bbox_iou() return interface (return raw_iou / ciou / exp_term uniformly and avoid recomputation), which will make your ablations faster and cleaner.
