YOLOv11没有改进前P,R,MAP50,MAP50-95的结果分别如下:0.923,0.828,...
Создано: 26 декабря 2025 г.
Отвечено с использованием GPT-5.2 Pro от Chat01
Создано: 26 декабря 2025 г.
Отвечено с использованием GPT-5.2 Pro от Chat01
YOLOv11没有改进前P,R,MAP50,MAP50-95的结果分别如下:0.923,0.828,0.89,0.719,我现在的代码如下:class SPD(nn.Module):
def init(self, scale=2):
super().init()
assert scale >= 2 and int(scale) == scale
self.scale = int(scale)
textdef forward(self, x): b, c, h, w = x.shape s = self.scale # 若你训练时可能遇到非整除尺寸(不常见,但做个保护),这里右/下补齐到能整除 pad_h = (s - h % s) % s pad_w = (s - w % s) % s if pad_h or pad_w: x = F.pad(x, (0, pad_w, 0, pad_h)) # (left,right,top,bottom) b, c, h, w = x.shape # [B,C,H,W] -> [B, C*s*s, H/s, W/s] x = x.view(b, c, h // s, s, w // s, s) x = x.permute(0, 1, 3, 5, 2, 4).contiguous() return x.view(b, c * s * s, h // s, w // s)
class EMA(nn.Module):
def init(self, channels, c2=None, factor=32):
super(EMA, self).init()
self.groups = factor
assert channels % factor == 0
assert channels // self.groups > 0
self.softmax = nn.Softmax(-1)
self.agp = nn.AdaptiveAvgPool2d((1, 1))
self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
self.pool_w = nn.AdaptiveAvgPool2d((1, None))
self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups)
self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0)
self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1)
textdef forward(self, x): b, c, h, w = x.size() group_x = x.reshape(b * self.groups, -1, h, w) # b*g,c//g,h,w x_h = self.pool_h(group_x) x_w = self.pool_w(group_x).permute(0, 1, 3, 2) hw = self.conv1x1(torch.cat([x_h, x_w], dim=2)) x_h, x_w = torch.split(hw, [h, w], dim=2) x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid()) x2 = self.conv3x3(group_x) x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1)) x12 = x2.reshape(b * self.groups, c // self.groups, -1) # b*g, c//g, hw x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1)) x22 = x1.reshape(b * self.groups, c // self.groups, -1) # b*g, c//g, hw weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w) return (group_x * weights.sigmoid()).reshape(b, c, h, w)
class SPDConvLite(nn.Module):
"""
SPDConvLite + EMA
args 兼容你 YAML: [c2, k, s, ratio, ema_factor]
"""
def init(self, c1, c2, k=3, s=2, ratio=0.5, ema_factor=16, act=True):
super().init()
self.spd = SPD(scale=s) # 你已有的 SPD(或 space_to_depth/PixelUnshuffle)
cin = c1 * s * s
hidden = max(16, int(c2 * ratio))
textself.pw1 = Conv(cin, hidden, 1, 1, act=act) self.dw = Conv(hidden, hidden, k, 1, g=hidden, act=act) self.pw2 = Conv(hidden, c2, 1, 1, act=act) # EMA 注意力:作用在输出通道 c2 上 # 如果你的 EMA 定义是 EMA(channels, c2=None, factor=32),这里传 channels=c2 即可 self.attn = EMA(c2, factor=ema_factor) self.alpha=nn.Parameter(torch.zeros((1))) def forward(self, x): x = self.spd(x) x = self.pw1(x) x = self.dw(x) y = self.pw2(x) y_attn = self.attn(y) return y+self.alpha*(y_attn-y)
class MSCA(nn.Module):
def init(self, c1, c2, reduction=8):
super().init()
assert c1 == c2, "MSCAPlus 目前假设输入输出通道相同"
dim = c2
text# 1. 多尺度空间分支 # 分支0: 局部 5x5 深度卷积 self.branch0 = nn.Conv2d(dim, dim, 5, padding=2, groups=dim) # 分支1: 条带卷积 (1x7 -> 7x1) self.b1_1 = nn.Conv2d(dim, dim, (1, 7), padding=(0, 3), groups=dim) self.b1_2 = nn.Conv2d(dim, dim, (7, 1), padding=(3, 0), groups=dim) # 分支2: 3x3 空洞卷积, dilation=2 (中等感受野) self.branch2 = nn.Conv2d(dim, dim, 3, padding=2, dilation=2, groups=dim) # 分支3: 3x3 空洞卷积, dilation=3 (大感受野, 替代 21x21) self.branch3 = nn.Conv2d(dim, dim, 3, padding=3, dilation=3, groups=dim) # 2. 组卷积融合多尺度特征 # 4 个分支 concat → 通道数 4*dim,groups=4 保证参数 ≈ 原来的 1x1 conv self.fuse = nn.Conv2d(4 * dim, dim, kernel_size=1, groups=4, bias=False) self.edge=nn.Conv2d(dim, dim, 3, padding=1, groups=dim, bias=False) # 3. 通道注意力 (SE-like) hidden = max(dim // reduction, 4) self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc1 = nn.Conv2d(dim, hidden, kernel_size=1, bias=True) self.fc2 = nn.Conv2d(hidden, dim, kernel_size=1, bias=True) # 激活与门控 self.act = nn.SiLU() self.sigmoid = nn.Sigmoid() self.gamma_add = nn.Parameter(torch.zeros(1)) self.gamma_gate = nn.Parameter(torch.zeros(1)) def forward(self, x): identity = x # 多尺度空间分支 b0 = self.branch0(x) b1 = self.b1_1(x) b1 = self.b1_2(b1) b2 = self.branch2(x) b3 = self.branch3(x) # concat + 融合 ms = torch.cat([b0, b1, b2, b3], dim=1) # [B, 4C, H, W] ms = self.fuse(ms) # [B, C, H, W] edge=self.edge(x) ms=ms+edge # 通道注意力 ca = self.avg_pool(ms) # [B, C, 1, 1] ca = self.fc2(self.act(self.fc1(ca))) # [B, C, 1, 1] ca = self.sigmoid(ca) attn = ms * ca # 空间 + 通道联合 gate = self.sigmoid(attn) # [0,1] # 残差门控:避免特征被破坏性缩放 out = identity + self.gamma_add*attn + self.gamma_gate*identity*gate return out
class EMA(nn.Module):
def init(self, channels, c2=None, factor=32):
super(EMA, self).init()
self.groups = factor
assert channels // self.groups > 0
self.softmax = nn.Softmax(-1)
self.agp = nn.AdaptiveAvgPool2d((1, 1))
self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
self.pool_w = nn.AdaptiveAvgPool2d((1, None))
self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups)
self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0)
self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1)
textdef forward(self, x): b, c, h, w = x.size() group_x = x.reshape(b * self.groups, -1, h, w) # b*g,c//g,h,w x_h = self.pool_h(group_x) x_w = self.pool_w(group_x).permute(0, 1, 3, 2) hw = self.conv1x1(torch.cat([x_h, x_w], dim=2)) x_h, x_w = torch.split(hw, [h, w], dim=2) x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid()) x2 = self.conv3x3(group_x) x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1)) x12 = x2.reshape(b * self.groups, c // self.groups, -1) # b*g, c//g, hw x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1)) x22 = x1.reshape(b * self.groups, c // self.groups, -1) # b*g, c//g, hw weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w) return (group_x * weights.sigmoid()).reshape(b, c, h, w)我的配置文件如下:当我只增加MSCA模块时
nc: 5 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
n: [0.50, 0.25, 1024] # summary: 181 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 181 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 231 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 357 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 357 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
backbone:
head:
[-1, 1, nn.Upsample, [None, 2, "nearest"]]
[[-1, 6], 1, Concat, [1]] # cat backbone P4
[-1, 2, C3k2, [512, False]] # 13
[-1, 1, nn.Upsample, [None, 2, "nearest"]]
[[-1, 4], 1, Concat, [1]] # cat backbone P3
[-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]]
[[-1, 14], 1, Concat, [1]] # cat head P4
[-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]]
[[-1, 11], 1, Concat, [1]] # cat head P5
[-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc]] # Detect(P3, P4, P5)
代码结果如下:0.935,0.866,0.914,0.725,我现在用SPD的配置文件:
nc: 5 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
n: [0.50, 0.25, 1024] # summary: 181 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 181 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 231 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 357 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 357 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
backbone:
head:
[-1, 1, nn.Upsample, [None, 2, "nearest"]]
[[-1, 6], 1, Concat, [1]] # cat backbone P4
[-1, 2, C3k2, [512, False]] # 13
[-1, 1, nn.Upsample, [None, 2, "nearest"]]
[[-1, 4], 1, Concat, [1]] # cat backbone P3
[-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
[-1, 1, SPDConvLite, [256, 3, 2]]
[[-1, 13], 1, Concat, [1]]
[-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
[-1, 1, SPDConvLite, [512, 3, 2]]
[[-1, 10], 1, Concat, [1]]
[-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
[[16, 19, 22], 1, Detect, [nc]]
结果如下:0.959,0.865,0.91,0.725
但是当我两者同时添加后配置文件如下:# Parameters
nc: 5 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
n: [0.50, 0.25, 1024] # summary: 181 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 181 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 231 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 357 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 357 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
backbone:
head:
[-1, 1, nn.Upsample, [None, 2, "nearest"]]
[[-1, 6], 1, Concat, [1]] # cat backbone P4
[-1, 2, C3k2, [512, False]] # 13
[-1, 1, nn.Upsample, [None, 2, "nearest"]]
[[-1, 4], 1, Concat, [1]] # cat backbone P3
[-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
[-1, 1, SPDConvLite, [256, 3, 2]]
[[-1, 14], 1, Concat, [1]] # cat head P4
[-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
[-1, 1, SPDConvLite, [512, 3, 2]]
[[-1, 11], 1, Concat, [1]] # cat head P5
[-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc]] # Detect(P3, P4, P5)
代码结果如下:0.909,0.864,0.896,0.719,我现在还想把我的检测头的改进点融入进去,我现在的检测头的改进配置文件如下:# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
nc: 5 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
n: [0.50, 0.25, 1024] # summary: 181 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 181 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 231 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 357 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 357 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
backbone:
#head: