这是我阅读的一个关于动态上采样的文章,这个文章里介绍了动态上采样的方法和操作细节,这两个动态上采样哪...
Создано: 11 декабря 2025 г.
Отвечено с использованием GPT-5.1 Thinking от Chat01
Создано: 11 декабря 2025 г.
Отвечено с использованием GPT-5.1 Thinking от Chat01
这是我阅读的一个关于动态上采样的文章,这个文章里介绍了动态上采样的方法和操作细节,这两个动态上采样哪个参数量小和效果好呢,哪个模型更新呢,我想将这个动态上采样插入在Ultralytics yolov11的模型里来替换原始的yolov11的双线性上采样,理解吗,我尝试了DySample的静态和动态两个模型,发现静态对指标map50-95是下降,但是其他指标都是上升,他的代码如下:def normal_init(module, mean=0, std=1, bias=0):
if hasattr(module, 'weight') and module.weight is not None:
nn.init.normal_(module.weight, mean, std)
if hasattr(module, 'bias') and module.bias is not None:
nn.init.constant_(module.bias, bias)
def constant_init(module, val, bias=0):
    """Fill `module.weight` with `val` and `module.bias` with `bias`, when those parameters exist."""
    weight = getattr(module, 'weight', None)
    if weight is not None:
        nn.init.constant_(weight, val)
    bias_param = getattr(module, 'bias', None)
    if bias_param is not None:
        nn.init.constant_(bias_param, bias)
class DySample(nn.Module):
def init(self, in_channels, scale=2, style='lp', groups=4, dyscope=False):
super().init()
self.scale = scale
self.style = style
self.groups = groups
assert style in ['lp', 'pl']
if style == 'pl':
assert in_channels >= scale ** 2 and in_channels % scale ** 2 == 0
assert in_channels >= groups and in_channels % groups == 0
textif style == 'pl': in_channels = in_channels // scale ** 2 out_channels = 2 * groups else: out_channels = 2 * groups * scale ** 2 self.offset = nn.Conv2d(in_channels, out_channels, 1) normal_init(self.offset, std=0.001) if dyscope: self.scope = nn.Conv2d(in_channels, out_channels, 1, bias=False) constant_init(self.scope, val=0.) self.register_buffer('init_pos', self._init_pos()) def _init_pos(self): h = torch.arange((-self.scale + 1) / 2, (self.scale - 1) / 2 + 1) / self.scale return torch.stack(torch.meshgrid([h, h])).transpose(1, 2).repeat(1, self.groups, 1).reshape(1, -1, 1, 1) def sample(self, x, offset): B, _, H, W = offset.shape offset = offset.view(B, 2, -1, H, W) coords_h = torch.arange(H) + 0.5 coords_w = torch.arange(W) + 0.5 coords = torch.stack(torch.meshgrid([coords_w, coords_h]) ).transpose(1, 2).unsqueeze(1).unsqueeze(0).type(x.dtype).to(x.device) normalizer = torch.tensor([W, H], dtype=x.dtype, device=x.device).view(1, 2, 1, 1, 1) coords = 2 * (coords + offset) / normalizer - 1 coords = F.pixel_shuffle(coords.view(B, -1, H, W), self.scale).view( B, 2, -1, self.scale * H, self.scale * W).permute(0, 2, 3, 4, 1).contiguous().flatten(0, 1) return F.grid_sample(x.reshape(B * self.groups, -1, H, W), coords, mode='bilinear', align_corners=False, padding_mode="border").view(B, -1, self.scale * H, self.scale * W) def forward_lp(self, x): if hasattr(self, 'scope'): offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos else: offset = self.offset(x) * 0.25 + self.init_pos return self.sample(x, offset) def forward_pl(self, x): x_ = F.pixel_shuffle(x, self.scale) if hasattr(self, 'scope'): offset = F.pixel_unshuffle(self.offset(x_) * self.scope(x_).sigmoid(), self.scale) * 0.5 + self.init_pos else: offset = F.pixel_unshuffle(self.offset(x_), self.scale) * 0.25 + self.init_pos return self.sample(x, offset) def forward(self, x): if self.style == 'pl': return self.forward_pl(x) return 
self.forward_lp(x)我找到的DLU代码是不是是下面呢,我应该复制哪段去注册和替换呢,yaml文件能直接插入吗:class CARAFENaiveFunction(Function): @staticmethod def symbolic(g, features, masks, kernel_size, group_size, scale_factor): return g.op( 'MMCVCARAFENaive', features, masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) @staticmethod def forward(ctx, features, masks, kernel_size, group_size, scale_factor): assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-2) == features.size(-2) * scale_factor assert features.size(1) % group_size == 0 assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 ctx.kernel_size = kernel_size ctx.group_size = group_size ctx.scale_factor = scale_factor ctx.feature_size = features.size() ctx.mask_size = masks.size() n, c, h, w = features.size() output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) ext_module.carafe_naive_forward( features, masks, output, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) if features.requires_grad or masks.requires_grad: ctx.save_for_backward(features, masks) return output @staticmethod def backward(ctx, grad_output): assert grad_output.is_cuda features, masks = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size scale_factor = ctx.scale_factor grad_input = torch.zeros_like(features) grad_masks = torch.zeros_like(masks) ext_module.carafe_naive_backward( grad_output.contiguous(), features, masks, grad_input, grad_masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) return grad_input, grad_masks, None, None, None
carafe_naive = CARAFENaiveFunction.apply
class CARAFENaive(Module):
textdef __init__(self, kernel_size, group_size, scale_factor): super(CARAFENaive, self).__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) self.kernel_size = kernel_size self.group_size = group_size self.scale_factor = scale_factor def forward(self, features, masks): return carafe_naive(features, masks, self.kernel_size, self.group_size, self.scale_factor)
class CARAFEFunction(Function):
text@staticmethod def symbolic(g, features, masks, kernel_size, group_size, scale_factor): return g.op( 'MMCVCARAFE', features, masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) @staticmethod def forward(ctx, features, masks, kernel_size, group_size, scale_factor): assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-2) == features.size(-2) * scale_factor assert features.size(1) % group_size == 0 assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 ctx.kernel_size = kernel_size ctx.group_size = group_size ctx.scale_factor = scale_factor ctx.feature_size = features.size() ctx.mask_size = masks.size() n, c, h, w = features.size() output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) routput = features.new_zeros(output.size(), requires_grad=False) rfeatures = features.new_zeros(features.size(), requires_grad=False) rmasks = masks.new_zeros(masks.size(), requires_grad=False) ext_module.carafe_forward( features, masks, rfeatures, routput, rmasks, output, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) if features.requires_grad or masks.requires_grad: ctx.save_for_backward(features, masks, rfeatures) return output @staticmethod def backward(ctx, grad_output): assert grad_output.is_cuda features, masks, rfeatures = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size scale_factor = ctx.scale_factor rgrad_output = torch.zeros_like(grad_output, requires_grad=False) rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) rgrad_input = torch.zeros_like(features, requires_grad=False) rgrad_masks = torch.zeros_like(masks, requires_grad=False) grad_input = torch.zeros_like(features, requires_grad=False) grad_masks = torch.zeros_like(masks, requires_grad=False) ext_module.carafe_backward( grad_output.contiguous(), rfeatures, masks, rgrad_output, 
rgrad_input_hs, rgrad_input, rgrad_masks, grad_input, grad_masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) return grad_input, grad_masks, None, None, None
carafe = CARAFEFunction.apply
class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures
textPlease refer to https://arxiv.org/abs/1905.02188 for more details. Args: kernel_size (int): reassemble kernel size group_size (int): reassemble group size scale_factor (int): upsample ratio Returns: upsampled feature map """ def __init__(self, kernel_size, group_size, scale_factor): super(CARAFE, self).__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) self.kernel_size = kernel_size self.group_size = group_size self.scale_factor = scale_factor def forward(self, features, masks): return carafe(features, masks, self.kernel_size, self.group_size, self.scale_factor)
@MODELS.register_module(name='dlu')
class DLUPack(nn.Module):
"""
Args:
channels (int): input feature channels
scale_factor (int): upsample ratio
up_kernel (int): kernel size of CARAFE op
up_group (int): group size of CARAFE op
encoder_kernel (int): kernel size of content encoder
encoder_dilation (int): dilation of content encoder
compressed_channels (int): output channels of channels compressor
textReturns: upsampled feature map """ def __init__(self, channels, scale_factor, up_kernel=5, up_group=1, encoder_kernel=3, encoder_dilation=1, compressed_channels=64): super(DLUPack, self).__init__() self.channels = channels self.scale_factor = scale_factor self.up_kernel = up_kernel self.up_group = up_group self.encoder_kernel = encoder_kernel self.encoder_dilation = encoder_dilation self.compressed_channels = compressed_channels self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, 1) self.kernel_space_generator = nn.Conv2d( self.compressed_channels, self.up_kernel * self.up_kernel * self.up_group, self.encoder_kernel, padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), dilation=self.encoder_dilation, groups=1) self.conv_offset = nn.Conv2d( self.compressed_channels, self.up_group * 2 * self.scale_factor * self.scale_factor, self.encoder_kernel, padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), dilation=self.encoder_dilation, bias=True) self.init_weights() def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform') normal_init(self.kernel_space_generator, std=0.001) self.conv_offset.weight.data.zero_() self.conv_offset.bias.data.zero_() def kernel_space_normalizer(self, mask): n, mask_c, h, w = mask.size() # use float division explicitly, # to void inconsistency while exporting to onnx mask_channel = int(mask_c / float(self.up_kernel**2)) mask = mask.view(n, mask_channel, -1, h, w) mask = F.softmax(mask, dim=2) mask = mask.view(n, mask_c, h, w).contiguous() return mask def kernel_space_expander(self, offset, mask): n, _, h, w = offset.size() offset = F.pixel_shuffle(offset, self.scale_factor) offset = offset.permute(0,2,3,1) offset[:,:,:,0] = offset[:,:,:,0] * 1/(w-1)*2 offset[:,:,:,1] = offset[:,:,:,1] * 1/(h-1)*2 new_h = torch.repeat_interleave(torch.linspace(-1, 1, h),self.scale_factor).view(-1, 1).repeat(1, self.scale_factor*w) new_w = 
torch.repeat_interleave(torch.linspace(-1, 1, w),self.scale_factor).repeat(self.scale_factor*h, 1) grid = torch.cat((new_w.unsqueeze(2), new_h.unsqueeze(2)), dim=2) grid = grid.unsqueeze(0) grid_ = grid.expand(n,-1,-1,-1) grid_ = grid_.to(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')) offset = grid_ + offset mask_ = F.grid_sample(mask, offset,padding_mode='border',align_corners=True) return mask_ def feature_reassemble(self, x, mask): x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) return x def forward(self, x): compressed_x = self.channel_compressor(x) offset = self.conv_offset(compressed_x) mask = self.kernel_space_generator(compressed_x) mask = self.kernel_space_normalizer(mask) mask_ = self.kernel_space_expander(offset, mask) x = self.feature_reassemble(x, mask_) return x
先直接回答你几个关键点,然后再说怎么在 YOLO 里落地:
DySample(你贴的这个实现)
style='lp'、groups=4、scale=2 时:
out_channels = 2 * groups * scale^2 = 2*4*4 = 32;offset 这一层是 Conv2d(256 → 32, k=1),参数量 ≈ 256×32 = 8,192;若 dyscope=True,再加一个同规格 scope 卷积,再 +8,192;完全是纯 PyTorch:conv + pixel_shuffle + grid_sample,不需要自定义 CUDA。(Learning to Upsample by Learning to Sample.pdf)
DLU(DLUPack)
DLU 的 kernel-generation 有 3 个主要卷积:(DLU.pdf)
channel_compressor: Conv2d(C → Cm, 1x1)kernel_space_generator: Conv2d(Cm → kup^2 * up_group, k=encoder_kernel)conv_offset: Conv2d(Cm → up_group * 2 * scale^2, k=encoder_kernel)论文建议配置:Cm=64, kup=5, encoder_kernel=3, up_group=1, scale=2。
那大概是:
channel_compressor: 256×64 ≈ 16Kkernel_space_generator: 64×25×3×3 ≈ 14.4Kconv_offset: 64×8×3×3 ≈ 4.6K合计 ≈ 35K 参数 / 每个上采样层,和论文 Table 2 里 “per op 35K params” 一致。(DLU.pdf)
结论:
- 在你这种典型 “2× FPN 上采样、C≈256” 场景下:
DySample < DLU < CARAFE(参数和 FLOPs 都是这个量级关系)。- 如果是 4×、8×、16× 这类大尺度上采样,DLU 相对 CARAFE 的优势会越来越明显,但和 DySample 谁更轻,还是要看具体配置。整体来看,DySample 仍然是最“抠门”的那个。
DLU 论文里(在 FPN 检测上):(DLU.pdf)
在 Libra R-CNN、Semantic FPN 等任务中,DLU 通常比 CARAFE 略好一点点,同时参数大幅减少,尤其是 8×、16× 这种大上采样时差距很夸张。(DLU.pdf)
DySample 论文里(在 SegFormer / Faster RCNN 等):(Learning to Upsample by Learning to Sample.pdf)
整体感觉:
时间上 DLU 更新,但“更新 ≠ 一定更好”,尤其你目前只需要 2× 上采样、又是 YOLO 这种实时模型,DySample 其实更合适、更容易落地。
你实现的这个 DySample 其实正是论文里说的 “static scope factor / dynamic scope factor” 两种版本:(Learning to Upsample by Learning to Sample.pdf)
offset = self.offset(x) * 0.25 + self.init_posoffset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos静态版直接把 offset 范围缩放在一个固定区间(理论上能减轻采样重叠、减少边界伪影),动态版会再乘一个由特征算出来的 gate(scope),更灵活,但也更复杂一点。
你观测到:
静态版:mAP50-95 略降,但是其他指标都升
可能原因大致是:
实践建议:
dyscope=True)。groups(4 / 8)、style='lp'/'pl',这些在论文里也有做 ablation(一般分组数 4 或 8 是一个不错的折中)。(Learning to Upsample by Learning to Sample.pdf)是的,这段确实就是 DLU 官方框架里的核心代码结构:(DLU.pdf)
CARAFENaiveFunction / CARAFEFunction / CARAFE / carafe
ext_module.carafe_forward/backward —— 这是 mmcv/mmdet 里编译好的 CUDA 扩展。DLUPack(nn.Module)
channel_compressorkernel_space_generatorconv_offsetkernel_space_normalizerkernel_space_expander (里面用 grid_sample 做 GUIDED 采样)feature_reassemble 里调用 carafe(x, mask_, ...) 完成特征重组要在 YOLO 里用 DLU,最小闭包至少需要:
CARAFEFunction / carafe(以及依赖的 CUDA 扩展)DLUPack
光复制 DLUPack 一段是跑不动的。
不能。
Ultralytics 的 yaml 只能写:
yaml- [from, number, ModuleName, [args...]]
这里的 ModuleName 必须是 已经在 Python 代码里定义并注册好的类/函数名,yaml 里是不能直接粘贴 class DLUPack(nn.Module): ... 这种代码的。
所以,正确流程是:
在 Ultralytics 项目里新建一个 .py 文件,比如 ultralytics/nn/modules/dlu.py,把 CARAFEFunction、carafe、DLUPack 这些 class/函数放进去。
确保 ext_module 对应的 C++/CUDA 扩展能正常 import(例如 mmcv 已经安装并编译带 carafe 的版本);
在 ultralytics/nn/modules/__init__.py 里 from .dlu import DLUPack,并加入到 __all__;
最后在模型 yaml 里用类似:
yaml- [-1, 1, DLUPack, [channels, 2, 5, 1, 3, 1, 64]]
这种形式调用(具体参数按 DLUPack 的 init 来填)。
实际上,由于 需要编译 mmcv 的 carafe CUDA 扩展,这一整套流程对 Ultralytics 这种纯 PyTorch 工程来说,集成成本很高,不像 DySample 那么“拎包即用”。
下面给一套可操作的思路,你可以对照 Ultralytics 的代码结构稍微改一下(YOLOv8/9/11 的结构差不多):
新建文件,例如:ultralytics/nn/modules/dysample.py,把你贴的 DySample 类放进去,然后再加一个包装层,专门做 FPN 的上采样:
python# ulralytics/nn/modules/dysample.py import torch import torch.nn as nn import torch.nn.functional as F # 你原来的 DySample 类,略... class DySampleUp(nn.Module): """ YOLO FPN 专用的 DySample 上采样模块 c1: 输入通道 scale: 上采样倍率 (一般是 2) 这里假设上采样前后通道数不变 """ def __init__(self, c1, scale=2, style='lp', groups=4, dyscope=True): super().__init__() self.dysample = DySample( in_channels=c1, scale=scale, style=style, groups=groups, dyscope=dyscope ) def forward(self, x): return self.dysample(x)
注意:
- 确保
groups能整除in_channels,否则assert in_channels % groups == 0会炸。- 如果你有某些层通道比较小(比如 64),就把 groups 改小一点(1 或 2)。
然后在 ultralytics/nn/modules/__init__.py 里加入:
pythonfrom .dysample import DySampleUp __all__ = [ # ... 'DySampleUp', ]
找一找你 yolov11-*.yaml 里类似这样的层(FPN/Path Aggregation 那几层):
yaml# 伪代码形式,大概类似 - [-1, 1, nn.Upsample, [None, 2, 'bilinear']]
或者在 Python 版模型定义里看到:
pythonself.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
你就把这些地方替换成我们刚才注册的模块,比如 yaml 里改成:
yaml# 假设某个尺度的特征通道数是 256 - [-1, 1, DySampleUp, [256, 2, 'lp', 4, True]]
然后把 DySampleUp.__init__ 的签名改成配合 yaml:
pythonclass DySampleUp(nn.Module): def __init__(self, c1, scale=2, style='lp', groups=4, dyscope=True): ...
Ultralytics 构建模型时会自动把 c1 设置成上一层输出通道数。
纯技术上是可行的,但门槛比 DySample 高很多:
你需要一个能用的 carafe CUDA op:
mmcv-full(要和 PyTorch/CUDA 版本对得上);DLU 的那大段代码基本都要搬进来:
CARAFEFunction / carafeDLUPack在 Ultralytics 里注册 DLUPack 类,然后在 yaml 里用类似:
yaml- [-1, 1, DLUPack, [channels, 2, 5, 1, 3, 1, 64]]
对应 __init__(channels, scale_factor, up_kernel, up_group, encoder_kernel, encoder_dilation, compressed_channels)。
如果你对 C++/CUDA 扩展编译不是很熟、又主要是做 YOLO 检测实验,我真心建议:
优先用 DySample(动态 scope 版),DLU 可以等之后有时间再慢慢折腾。
如果你愿意,可以把你当前的 yolov11.yaml 和你改后的 DySample 代码也贴一份,我可以帮你具体看看哪几个 upsample 层最适合替换、参数怎么配比较稳。
我完成了注册和yolov11的配置文件,为什么代码报错呢,我现在的配置文件如下,而且他也没有sl:# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
nc: 5 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
n: [0.50, 0.25, 1024] # summary: 181 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 181 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 231 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 357 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 357 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
backbone:
head:
[[-1, 6], 1, Concat, [1]] # cat backbone P4
[-1, 2, C3k2, [512, False]] # 13
[-1, 1, DySample, [2, "lp"]]
[[-1, 4], 1, Concat, [1]] # cat backbone P3
[-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]]
[[-1, 14], 1, Concat, [1]] # cat head P4
[-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]]
[[-1, 11], 1, Concat, [1]] # cat head P5
[-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc]] # Detect(P3, P4, P5)
我也完成了注册,我的task相关代码如下:def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
"""
Parse a YOLO model.yaml dictionary into a PyTorch model.
textArgs: d (dict): Model dictionary. ch (int): Input channels. verbose (bool): Whether to print model details. Returns: (tuple): Tuple containing the PyTorch model and sorted list of output layers. """ import ast # Args legacy = True # backward compatibility for v3/v5/v8/v9 models max_channels = float("inf") nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales")) depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape")) if scales: scale = d.get("scale") if not scale: scale = tuple(scales.keys())[0] LOGGER.warning(f"no model scale passed. Assuming scale='{scale}'.") depth, width, max_channels = scales[scale] if act: Conv.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = torch.nn.SiLU() if verbose: LOGGER.info(f"{colorstr('activation:')} {act}") # print if verbose: LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10} {'module':<45}{'arguments':<30}") ch = [ch] layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out base_modules = frozenset( { Classify, Conv, CBAM, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2, RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost, torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB, A2C2f, LSKBlock, DySample, MSCA, } ) repeat_modules = frozenset( # modules with 'repeat' arguments { BottleneckCSP, C1, C2, C2f, C3k2, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3, C2fPSA, C2fCIB, C2PSA, A2C2f, ECA, CoordAtt, SimAM, LSKBlock } ) for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]): # from, number, module, args m = ( getattr(torch.nn, m[3:]) if "nn." in m else getattr(__import__("torchvision").ops, m[16:]) if "torchvision.ops." 
in m else globals()[m] ) # get module for j, a in enumerate(args): if isinstance(a, str): with contextlib.suppress(ValueError): args[j] = locals()[a] if a in locals() else ast.literal_eval(a) n = n_ = max(round(n * depth), 1) if n > 1 else n # depth gain if m in base_modules: c1, c2 = ch[f], args[0] if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output) c2 = make_divisible(min(c2, max_channels) * width, 8) if m is C2fAttn: # set 1) embed channels and 2) num heads args[1] = make_divisible(min(args[1], max_channels // 2) * width, 8) args[2] = int(max(round(min(args[2], max_channels // 2 // 32)) * width, 1) if args[2] > 1 else args[2]) args = [c1, c2, *args[1:]] if m in repeat_modules: args.insert(2, n) # number of repeats n = 1 if m is C3k2: # for M/L/X sizes legacy = False if scale in "mlx": args[3] = True if m is A2C2f: legacy = False if scale in "lx": # for L/X sizes args.extend((True, 1.2)) if m is C2fCIB: legacy = False elif m is AIFI: args = [ch[f], *args] elif m in frozenset({HGStem, HGBlock}): c1, cm, c2 = ch[f], args[0], args[1] args = [c1, cm, c2, *args[2:]] if m is HGBlock: args.insert(4, n) # number of repeats n = 1 elif m is ResNetLayer: c2 = args[1] if args[3] else args[1] * 4 elif m is torch.nn.BatchNorm2d: args = [ch[f]] elif m is Concat: c2 = sum(ch[x] for x in f) elif m in frozenset( {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect} ): args.append([ch[x] for x in f]) if m is Segment or m is YOLOESegment: args[2] = make_divisible(min(args[2], max_channels) * width, 8) if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}: m.legacy = legacy elif m is RTDETRDecoder: # special case, channels arg must be passed in index 1 args.insert(1, [ch[x] for x in f]) elif m is CBLinear: c2 = args[0] c1 = ch[f] args = [c1, c2, *args[1:]] elif m is CBFuse: c2 = ch[f[-1]] elif m in frozenset({TorchVision, Index}): c2 = args[0] c1 = ch[f] args = [*args[1:]] elif m is DySample: c2=ch[f] 
args=[ch[f], *args] else: c2 = ch[f] m_ = torch.nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module t = str(m)[8:-2].replace("__main__.", "") # module type m_.np = sum(x.numel() for x in m_.parameters()) # number params m_.i, m_.f, m_.type = i, f, t # attach index, 'from' index, type if verbose: LOGGER.info(f"{i:>3}{str(f):>20}{n_:>3}{m_.np:10.0f} {t:<45}{str(args):<30}") # print save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist layers.append(m_) if i == 0: ch = [] ch.append(c2)Traceback (most recent call last):
File "train.py", line 17, in <module>
model = YOLO(model='ultralytics/cfg/models/11/yolo11n.yaml')
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/models/yolo/model.py", line 53, in init
super().init(model=model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 146, in init
self._new(model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 258, in _new
self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 342, in init
self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 1453, in parse_model
c2 = make_divisible(min(c2, max_channels) * width, 8)
TypeError: '<' not supported between instances of 'int' and 'NoneType'
(Yolov11) jd@jd-x11dai-n:/media/jd/4997BB1603CFE2C4/lw/ultralytic$ python train.py
Dysmample:in_channels=256,groups=4,style=lp,scale=8
/home/jd/anaconda3/envs/Yolov11/lib/python3.8/site-packages/torch/functional.py:513: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3609.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
Dysmample:in_channels=128,groups=4,style=lp,scale=8
Traceback (most recent call last):
File "train.py", line 17, in <module>
model = YOLO(model='ultralytics/cfg/models/11/yolo11n.yaml')
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/models/yolo/model.py", line 53, in init
super().init(model=model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 146, in init
self._new(model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 258, in _new
self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 342, in init
self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 1489, in parse_model
args.append([ch[x] for x in f])
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 1489, in <listcomp>
args.append([ch[x] for x in f])
IndexError: list index out of range
什么原因呢
return torch.nn.Sequential(*layers), sorted(save)代码报错如下:
我完成了注册和yolov11的配置文件,为什么代码报错呢,我现在的配置文件如下,而且他也没有sl:# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
nc: 5 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolo11n.yaml' will call yolo11.yaml with scale 'n'
n: [0.50, 0.25, 1024] # summary: 181 layers, 2624080 parameters, 2624064 gradients, 6.6 GFLOPs
s: [0.50, 0.50, 1024] # summary: 181 layers, 9458752 parameters, 9458736 gradients, 21.7 GFLOPs
m: [0.50, 1.00, 512] # summary: 231 layers, 20114688 parameters, 20114672 gradients, 68.5 GFLOPs
l: [1.00, 1.00, 512] # summary: 357 layers, 25372160 parameters, 25372144 gradients, 87.6 GFLOPs
x: [1.00, 1.50, 512] # summary: 357 layers, 56966176 parameters, 56966160 gradients, 196.0 GFLOPs
backbone:
head:
[[-1, 6], 1, Concat, [1]] # cat backbone P4
[-1, 2, C3k2, [512, False]] # 13
[-1, 1, DySample, [2, "lp"]]
[[-1, 4], 1, Concat, [1]] # cat backbone P3
[-1, 2, C3k2, [256, False]] # 16 (P3/8-small)
[-1, 1, Conv, [256, 3, 2]]
[[-1, 14], 1, Concat, [1]] # cat head P4
[-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)
[-1, 1, Conv, [512, 3, 2]]
[[-1, 11], 1, Concat, [1]] # cat head P5
[-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)
[[17, 20, 23], 1, Detect, [nc]] # Detect(P3, P4, P5)
我也完成了注册,我的task相关代码如下:def parse_model(d, ch, verbose=True): # model_dict, input_channels(3)
"""
Parse a YOLO model.yaml dictionary into a PyTorch model.
textArgs: d (dict): Model dictionary. ch (int): Input channels. verbose (bool): Whether to print model details. Returns: (tuple): Tuple containing the PyTorch model and sorted list of output layers. """ import ast # Args legacy = True # backward compatibility for v3/v5/v8/v9 models max_channels = float("inf") nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales")) depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape")) if scales: scale = d.get("scale") if not scale: scale = tuple(scales.keys())[0] LOGGER.warning(f"no model scale passed. Assuming scale='{scale}'.") depth, width, max_channels = scales[scale] if act: Conv.default_act = eval(act) # redefine default activation, i.e. Conv.default_act = torch.nn.SiLU() if verbose: LOGGER.info(f"{colorstr('activation:')} {act}") # print if verbose: LOGGER.info(f"\n{'':>3}{'from':>20}{'n':>3}{'params':>10} {'module':<45}{'arguments':<30}") ch = [ch] layers, save, c2 = [], [], ch[-1] # layers, savelist, ch out base_modules = frozenset( { Classify, Conv, CBAM, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2, RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost, torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB, A2C2f, LSKBlock, DySample, MSCA, } ) repeat_modules = frozenset( # modules with 'repeat' arguments { BottleneckCSP, C1, C2, C2f, C3k2, C2fAttn, C3, C3TR, C3Ghost, C3x, RepC3, C2fPSA, C2fCIB, C2PSA, A2C2f, ECA, CoordAtt, SimAM, LSKBlock } ) for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]): # from, number, module, args m = ( getattr(torch.nn, m[3:]) if "nn." in m else getattr(__import__("torchvision").ops, m[16:]) if "torchvision.ops." 
in m else globals()[m] ) # get module for j, a in enumerate(args): if isinstance(a, str): with contextlib.suppress(ValueError): args[j] = locals()[a] if a in locals() else ast.literal_eval(a) n = n_ = max(round(n * depth), 1) if n > 1 else n # depth gain if m in base_modules: c1, c2 = ch[f], args[0] if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output) c2 = make_divisible(min(c2, max_channels) * width, 8) if m is C2fAttn: # set 1) embed channels and 2) num heads args[1] = make_divisible(min(args[1], max_channels // 2) * width, 8) args[2] = int(max(round(min(args[2], max_channels // 2 // 32)) * width, 1) if args[2] > 1 else args[2]) args = [c1, c2, *args[1:]] if m in repeat_modules: args.insert(2, n) # number of repeats n = 1 if m is C3k2: # for M/L/X sizes legacy = False if scale in "mlx": args[3] = True if m is A2C2f: legacy = False if scale in "lx": # for L/X sizes args.extend((True, 1.2)) if m is C2fCIB: legacy = False elif m is AIFI: args = [ch[f], *args] elif m in frozenset({HGStem, HGBlock}): c1, cm, c2 = ch[f], args[0], args[1] args = [c1, cm, c2, *args[2:]] if m is HGBlock: args.insert(4, n) # number of repeats n = 1 elif m is ResNetLayer: c2 = args[1] if args[3] else args[1] * 4 elif m is torch.nn.BatchNorm2d: args = [ch[f]] elif m is Concat: c2 = sum(ch[x] for x in f) elif m in frozenset( {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect} ): args.append([ch[x] for x in f]) if m is Segment or m is YOLOESegment: args[2] = make_divisible(min(args[2], max_channels) * width, 8) if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}: m.legacy = legacy elif m is RTDETRDecoder: # special case, channels arg must be passed in index 1 args.insert(1, [ch[x] for x in f]) elif m is CBLinear: c2 = args[0] c1 = ch[f] args = [c1, c2, *args[1:]] elif m is CBFuse: c2 = ch[f[-1]] elif m in frozenset({TorchVision, Index}): c2 = args[0] c1 = ch[f] args = [*args[1:]] elif m is DySample: c2=ch[f] 
args=[ch[f], *args] else: c2 = ch[f] m_ = torch.nn.Sequential(*(m(*args) for _ in range(n))) if n > 1 else m(*args) # module t = str(m)[8:-2].replace("__main__.", "") # module type m_.np = sum(x.numel() for x in m_.parameters()) # number params m_.i, m_.f, m_.type = i, f, t # attach index, 'from' index, type if verbose: LOGGER.info(f"{i:>3}{str(f):>20}{n_:>3}{m_.np:10.0f} {t:<45}{str(args):<30}") # print save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1) # append to savelist layers.append(m_) if i == 0: ch = [] ch.append(c2)Traceback (most recent call last):
File "train.py", line 17, in <module>
model = YOLO(model='ultralytics/cfg/models/11/yolo11n.yaml')
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/models/yolo/model.py", line 53, in init
super().init(model=model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 146, in init
self._new(model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 258, in _new
self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 342, in init
self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 1453, in parse_model
c2 = make_divisible(min(c2, max_channels) * width, 8)
TypeError: '<' not supported between instances of 'int' and 'NoneType'
(Yolov11) jd@jd-x11dai-n:/media/jd/4997BB1603CFE2C4/lw/ultralytic$ python train.py
Dysmample:in_channels=256,groups=4,style=lp,scale=8
/home/jd/anaconda3/envs/Yolov11/lib/python3.8/site-packages/torch/functional.py:513: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3609.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
Dysmample:in_channels=128,groups=4,style=lp,scale=8
Traceback (most recent call last):
File "train.py", line 17, in <module>
model = YOLO(model='ultralytics/cfg/models/11/yolo11n.yaml')
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/models/yolo/model.py", line 53, in init
super().init(model=model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 146, in init
self._new(model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 258, in _new
self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 342, in init
self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 1489, in parse_model
args.append([ch[x] for x in f])
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 1489, in <listcomp>
args.append([ch[x] for x in f])
IndexError: list index out of range
什么原因呢
return torch.nn.Sequential(*layers), sorted(save)

代码报错如下:
先说结论,然后一点点对照你贴的代码排错:
TypeError: '<' not supported between instances of 'int' and 'NoneType'

报错位置:
pythonc2 = make_divisible(min(c2, max_channels) * width, 8)
这行只会在 if m in base_modules: 里面执行。
你又在 base_modules 里加了 DySample:
pythonbase_modules = frozenset( { ... LSKBlock, DySample, # ← 你加的 MSCA, } )
但 DySample 的第一个参数不是 out_channels,而是 scale,而且它本身也不该走“卷积类模块”的这套逻辑。
更关键的是:有些模块(比如原来的 nn.Upsample)在 yaml 里写的是 [None, 2, 'nearest'],
args[0] 就是 None,一旦这个模块被错误地当成 base_modules 去处理,就会变成:
pythonc1, c2 = ch[f], args[0] # c2 = None c2 = make_divisible(min(c2, max_channels) * width, 8) # -> min(None, 1024) : int 和 None 比较,直接 TypeError
所以第一次那个 TypeError,本质就是:
把 DySample(以及类似的模块)错误地丢进了 base_modules,
让解析器以为 args[0] 是“输出通道”,去做min(c2, max_channels),结果 c2 里有 None。
IndexError: list index out of range(出现在 Detect 上)

报错位置:
pythonelif m in frozenset({Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}): args.append([ch[x] for x in f])
你 yaml 里 Detect 这一行是:
yaml- [[17, 20, 23], 1, Detect, [nc]] # Detect(P3, P4, P5)
parse_model 在 构建第 i 层时,ch 里只存放了 0..(i-1) 层的输出通道数。
对 Detect 这一行来说,它自己的索引是 23,所以此时 ch 只有 index 0..22。
但你写的 from = [17, 20, 23]:
ch[23] 越界 → IndexError。

同时,从结构上看,你真正要拿来做检测的 P3/P4/P5 的输出层,还是:
也就是说,这一行 根本不应该改成 [17, 20, 23],应该保持原始的:
yaml- [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)
你把 Detect 这行“跟着一起 +1”了,就直接踩到 ch 越界。
你现在的这段是:
pythonelif m is DySample: c2=ch[f] args=[ch[f], *args]
思路是对的:DySample 不改通道数,输出通道 c2 = 输入通道 c1,并且在 args 前面插入 in_channels。
但是有两个关键点要注意:
base_modules 里,否则永远先走 if m in base_modules: 分支,elif m is DySample: 这一段根本不会执行到(被短路了)。elif m is DySample: 必须放在其他 elif 分支之前不会被覆盖就行,现在位置 OK。所以你可以修改为:
base_modules 里删掉 DySamplepythonbase_modules = frozenset( { Classify, Conv, CBAM, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2, RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost, torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB, A2C2f, LSKBlock, # ❌ DySample 不要放这里 MSCA, } )
紧接着其它 elif 其中之一,加上:
pythonelif m is DySample: # DySample 不改变通道数:c2 = c1 c1 = ch[f] c2 = c1 # yaml 里你写的是 [2, "lp"],即 [scale, style] scale = args[0] if len(args) > 0 else 2 style = args[1] if len(args) > 1 else "lp" groups = args[2] if len(args) > 2 else 4 dyscope = args[3] if len(args) > 3 else False # DySample(in_channels, scale, style, groups, dyscope) args = [c1, scale, style, groups, dyscope]
注意这里 完全不走 make_divisible 那套逻辑,也不依赖 max_channels。
这样:
min(c2, max_channels) 就不会再碰到 DySample;DySample( in_channels = ch[f], scale=2, style="lp", groups=4, dyscope=False );你打印的日志就应该变成:
textDySample: in_channels=1024, groups=4, style=lp, scale=2 DySample: in_channels=512, groups=4, style=lp, scale=2 ...
如果你看到 scale=8 那种,基本就是参数顺序 / 解析错位了,按上面这样拼 args 就对。
你现在的 head:
yamlhead: - [-1, 1, DySample, [2, "lp"]] - [[-1, 6], 1, Concat, [1]] - [-1, 2, C3k2, [512, False]] # 13 - [-1, 1, DySample, [2, "lp"]] - [[-1, 4], 1, Concat, [1]] - [-1, 2, C3k2, [256, False]] # 16 (P3/8-small) - [-1, 1, Conv, [256, 3, 2]] - [[-1, 14], 1, Concat, [1]] # cat head P4 ← 你改成 14 - [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium) - [-1, 1, Conv, [512, 3, 2]] - [[-1, 11], 1, Concat, [1]] # cat head P5 ← 你改成 11 - [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large) - [[17, 20, 23], 1, Detect, [nc]] # ← 你也改了 from
其实除了把 nn.Upsample 换成 DySample 以外,其它 from 索引完全不需要动,保持原版就行。
推荐改回下面这样:
yamlhead: # 11: P5 -> P4 上采样 - [-1, 1, DySample, [2, "lp"]] - [[-1, 6], 1, Concat, [1]] # 12, cat backbone P4 - [-1, 2, C3k2, [512, False]] # 13, FPN P4 # 14: P4 -> P3 上采样 - [-1, 1, DySample, [2, "lp"]] - [[-1, 4], 1, Concat, [1]] # 15, cat backbone P3 - [-1, 2, C3k2, [256, False]] # 16, P3/8-small - [-1, 1, Conv, [256, 3, 2]] # 17, P3 -> P4 下采样 - [[-1, 13], 1, Concat, [1]] # 18, cat head P4 (注意这里是 13,不是 14) - [-1, 2, C3k2, [512, False]] # 19, P4/16-medium - [-1, 1, Conv, [512, 3, 2]] # 20, P4 -> P5 下采样 - [[-1, 10], 1, Concat, [1]] # 21, cat head P5 (注意这里是 10,不是 11) - [-1, 2, C3k2, [1024, True]] # 22, P5/32-large - [[16, 19, 22], 1, Detect, [nc]] # 23, Detect(P3, P4, P5)
这样:
i=23,ch 只到 22,不再访问 ch[23],IndexError 消失。你在 yaml 里定义的:
yamlscales: n: [0.50, 0.25, 1024] s: [0.50, 0.50, 1024] m: [0.50, 1.00, 512] l: [1.00, 1.00, 512] x: [1.00, 1.50, 512]
这是 YOLO11 官方的写法,规模只有 n/s/m/l/x 五种,本来就没有 sl 这个 key。
一般 YOLO('yolo11n.yaml') 会自动把 scale 设置为 'n',然后取 scales['n'],不会去找 sl,所以这个不用担心。
base_modules 里,但有单独的 elif m is DySample 分支。__init__ 里保留你的 debug 打印:
确认两层分别是pythonprint(f"DySample: in_channels={in_channels}, groups={groups}, style={style}, scale={scale}")
(1024, 2) 和 (512, 2) 这种组合。如果你跑完还有新的报错,把新的栈信息和你最新的 DySample 定义贴出来,我可以帮你继续对着改。
pl和lp有什么区别呢,哪个是静态,哪个是动态上采样呢,这个参数我后续怎么调节呢
先把几个关键词对上:
dyscope 这个布尔量控制的是 scope 因子是静态还是动态。所以:“哪个是静态、哪个是动态上采样”这个说法有点混了两个维度,我们拆开讲。
对应论文里的 LP-style 和 PL-style 两种 offset 生成风格。(Learning to Upsample by Learning to Sample.pdf)
style='lp':Linear + PixelShuffle(LP 风格)你的代码里:
pythondef forward_lp(self, x): if hasattr(self, 'scope'): offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos else: offset = self.offset(x) * 0.25 + self.init_pos return self.sample(x, offset)
流程是:
x 做 1×1 卷积:offset = Conv2d(in_channels, out_channels)out_channels = 2 * groups * scale^2(比如 C=256,g=4,s=2 时,out=32)sample() 里面用 pixel_shuffle + grid_sample 把这个 offset reshape 成 (2, sH, sW) 的采样网格。特点:
in_channels * 2 * groups * scale^2style='pl':PixelShuffle + Linear(PL 风格)你的代码里:
pythondef __init__(...): if style == 'pl': in_channels = in_channels // scale ** 2 out_channels = 2 * groups else: out_channels = 2 * groups * scale ** 2 self.offset = nn.Conv2d(in_channels, out_channels, 1)
pythondef forward_pl(self, x): x_ = F.pixel_shuffle(x, self.scale) # C -> C/s^2, H,W -> sH,sW if hasattr(self, 'scope'): offset = F.pixel_unshuffle( self.offset(x_) * self.scope(x_).sigmoid(), self.scale ) * 0.5 + self.init_pos else: offset = F.pixel_unshuffle(self.offset(x_), self.scale) * 0.25 + self.init_pos return self.sample(x, offset)
流程是:
x 做 pixel_shuffle,空间从 H×W 变成 sH×sW,通道从 C 变成 C / s^2Conv2d(C/s^2 → 2*groups)pixel_unshuffle 回到原来的低分辨率 offset 格式,交给 sample() 去用。特点:
(in_channels / scale^2) * 2 * groupspixel_shuffle/pixel_unshuffle,显存和一点点延时会上去一点都不是。
lp / pl 只决定 “offset 怎么算”,不是静态/动态。
在 DySample 里,“静态/动态”说的是 scope factor(偏移幅度的缩放因子):
O = 0.25 * linear(X) (Learning to Upsample by Learning to Sample.pdf)python# static offset = self.offset(x) * 0.25 + self.init_pos
O = 0.5 * sigmoid(linear1(X)) * linear2(X)(Learning to Upsample by Learning to Sample.pdf)python# dynamic offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos
scope(x).sigmoid() ∈ (0,1),乘上 0.5,等价于把 offset 范围从固定 0.25 换成一个 按位置变化的 [0, 0.5] 区间。而你在 __init__ 里是这样写的:
pythonclass DySample(nn.Module): def __init__(..., dyscope=False): ... self.offset = nn.Conv2d(...) ... if dyscope: self.scope = nn.Conv2d(...)
dyscope=False → 只用 offset 那个卷积 → 静态 scope factor(论文里的 “static scope factor” 版本)(Learning to Upsample by Learning to Sample.pdf)dyscope=True → 同时用 offset 和 scope 两个卷积 → 动态 scope factor(论文里的 “dynamic scope factor” 版本,即 DySample+ / S+)(Learning to Upsample by Learning to Sample.pdf)总结一下:
| 维度 | 选项 | 你代码里的参数 |
|---|---|---|
| offset 风格 | LP / PL | style='lp'/'pl' |
| scope 类型 | static / dynamic | dyscope=False/True |
所以:
style='lp', dyscope=False → DySample(LP + static)style='lp', dyscope=True → DySample+(LP + dynamic)style='pl', dyscope=False → DySample-S(PL + static)style='pl', dyscope=True → DySample-S+(PL + dynamic)四个都是“动态上采样”(都会根据特征生成 offset),只是设计细节不同。
结合论文的 ablation 和你的 YOLO 场景,可以这么玩:(Learning to Upsample by Learning to Sample.pdf)
建议先用:
style='lp', groups=4, dyscope=True
理由:
groups=4 是作者在 LP 风格下推荐的分组数,性能和稳定性都不错。(Learning to Upsample by Learning to Sample.pdf)dyscope=True(动态 scope)一般能带来 +0.1 左右 AP/mIoU 的小提升,代价是增加一个额外的 1×1 卷积,但参数量很小。(Learning to Upsample by Learning to Sample.pdf)如果你用的是 n/s 超轻量模型,担心算力/显存:
dyscope=False,也就是 静态 scope 版本,只用一个 offset 卷积,复杂度 minimal。dyscope=True 看看。style='pl'(PL 风格)在下面两种场景可以尝试 PL:
scale=2 的上采样,PL 风格的 offset 分支参数是 LP 的大约 1/16,在模型特别大的时候(比如很多 FPN 层都换成动态上采样)会有点收益。(Learning to Upsample by Learning to Sample.pdf)但注意 PL 有两个小坑:
in_channels % scale^2 == 0(你代码已经 assert 了),否则会直接报错;pixel_shuffle 和 pixel_unshuffle,前向和显存稍微重一点点(比 LP)(Learning to Upsample by Learning to Sample.pdf)假设你现在在 YOLO11 上做实验,可以这样试:
style='lp', groups=4, dyscope=Falsedyscope=Truestyle='pl',同时把 groups 改为 8(跟论文推荐一致);(Learning to Upsample by Learning to Sample.pdf)dyscope 决定:False 是静态 scope,True 是动态 scope;style='lp', groups=4,先试 dyscope=False,再视情况打开 dyscope=True 做对比。如果你愿意,可以把你现在的 DySample 定义(完整类)贴一次,我可以帮你标注出哪种组合对应论文里的 DySample / DySample+ / S / S+,顺便一起看看 groups 和 scale 的配置有没有潜在坑。
我不添加任何模块前的算法结果,p,r,map50,map50-95的结果如下:0.923,0.828,0.89,0.719,我使用style='lp', groups=4, dyscope=False的情况下结果如下:0.947,0.882,0.901,0.72,这个结果比style='lp', groups=4, dyscope=True,和style='pl', groups=4, dyscope=True,都好,那下一步怎么办呢,我还想将这个模块进行一定改进,请结合所有的动态上采样的方法,请给出下一步的修改方案,在这个模块流程基础上还有优化空间吗,或者能借鉴DLU的一些思想进行优化吗,请给出,要确保能有一定提升
你这个结果其实**很正常,而且挺“对味”**的 😂:
style='lp', groups=4, dyscope=False):说明两件事:
dyscope=True)和 PL 风格在你这个检测场景里反而有点“过拟合抖动”,高 IoU 反而没有占便宜——这和论文在不同任务上的结论是对得上的:在检测上 LP+static 非常强,PL 和 dynamic 未必一定更好。(Learning to Upsample by Learning to Sample.pdf)下面我按你现在这个 “LP + static + groups=4” 作为基线,给你三个循序渐进的改进方向,每个方向都尽量控制复杂度,都是结合 DySample / DLU / CARAFE / FADE / SAPA 的思路来的。
先提前说一句:我没法“保证一定提升”,但这些改法都是针对你现在的问题(高 IoU 提升不明显)设计的,理论上是朝着 mAP50-95 方向去的,而且额外参数都很少,可以很快做对比实验。
DySample 论文里静态 scope 因子固定是 0.25:
这是在 SegFormer/Faster-RCNN 上通过表 2 的 ablation 找到的最优值。(Learning to Upsample by Learning to Sample.pdf)
但:
在当前 DySample 类里面加一个(每层一个的)可学习 scope 因子 α,但仍然限制在 [0, 0.5] 这个安全区间内:
pythonclass DySample(nn.Module): def __init__(self, in_channels, scale=2, style='lp', groups=4, dyscope=False, learnable_scope=False): super().__init__() ... self.learnable_scope = learnable_scope if learnable_scope: # 初始化成 sigmoid(logit_alpha) ≈ 0.25 init_alpha = 0.25 import math logit = math.log(init_alpha / (0.5 - init_alpha)) # 映射到 (0,0.5) self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float)) else: self.register_buffer('alpha', torch.tensor(0.25, dtype=torch.float)) ... def get_alpha(self, x): if self.learnable_scope: # 映射到 (0, 0.5) return 0.5 * torch.sigmoid(self.logit_alpha) else: return self.alpha def forward_lp(self, x): alpha = self.get_alpha(x) if hasattr(self, 'scope'): # 如果以后你想再试动态 scope,这里用 2*alpha 代替原来 0.5 offset = self.offset(x) * self.scope(x).sigmoid() * (2 * alpha) + self.init_pos else: # 这里用 alpha 代替 0.25 offset = self.offset(x) * alpha + self.init_pos return self.sample(x, offset)
然后 YOLO 里你现在的两层 DySample 就写成:
yaml- [-1, 1, DySample, [2, "lp", 4, False, True]] # scale, style, groups, dyscope, learnable_scope
预期效果:
- 不会改变“静态 scope”这一整体风格,只是让 0.25 变成数据驱动的最优值;
- 理论上更有利于在高 IoU(mAP75~95)附近找到最佳平衡点,mAP50-95 有望比现在的 0.720 再抬一点点。
看过你贴的 DLU 代码你应该注意到了,它在生成 kernel / mask 之前会做一套:
channel_compressor(1×1) +kernel_space_generator(3×3, dilation)本质上是一个局部内容编码器:在生成上采样核/权重之前,先用 3×3 卷积把局部空间信息揉一遍,这样比单纯的 1×1 更容易理解边界 / 纹理。
DySample 目前的 offset 生成只有一个 1×1 conv:
你可以照着 DLU 的思路,给 DySample 的 offset 分支也加一个轻量 content encoder,但仍然保持单输入、纯 PyTorch、用 grid_sample 的流程。(Learning to Upsample by Learning to Sample.pdf)
pythonclass DySampleV2(nn.Module): def __init__(self, in_channels, scale=2, style='lp', groups=4, dyscope=False, encoder_kernel=3, encoder_dilation=1, compressed_channels=None, learnable_scope=False): super().__init__() self.scale = scale self.style = style self.groups = groups if style == 'pl': assert in_channels >= scale ** 2 and in_channels % scale ** 2 == 0 assert in_channels >= groups and in_channels % groups == 0 # --------- DLU 风格的内容编码器 ---------- mid = compressed_channels or max(in_channels // 2, 16) self.compress = nn.Conv2d(in_channels, mid, 1) self.encoder = nn.Conv2d( mid, in_channels, # 保持通道数不变,后面仍然喂给 offset/scope 分支 encoder_kernel, padding=int((encoder_kernel - 1) * encoder_dilation / 2), dilation=encoder_dilation, groups=1, bias=True, ) if style == 'pl': in_for_offset = in_channels // scale ** 2 out_channels = 2 * groups else: in_for_offset = in_channels out_channels = 2 * groups * scale ** 2 self.offset = nn.Conv2d(in_for_offset, out_channels, 1) normal_init(self.offset, std=0.001) if dyscope: self.scope = nn.Conv2d(in_for_offset, out_channels, 1, bias=False) constant_init(self.scope, val=0.) 
self.learnable_scope = learnable_scope if learnable_scope: import math logit = math.log(0.25 / (0.5 - 0.25)) self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float)) else: self.register_buffer('alpha', torch.tensor(0.25, dtype=torch.float)) self.register_buffer('init_pos', self._init_pos()) def get_alpha(self): if self.learnable_scope: return 0.5 * torch.sigmoid(self.logit_alpha) else: return self.alpha def encode(self, x): x = self.compress(x) x = F.relu(x, inplace=True) x = self.encoder(x) return x def forward_lp(self, x): x_enc = self.encode(x) alpha = self.get_alpha() if hasattr(self, 'scope'): offset = self.offset(x_enc) * self.scope(x_enc).sigmoid() * (2 * alpha) + self.init_pos else: offset = self.offset(x_enc) * alpha + self.init_pos return self.sample(x, offset) def forward_pl(self, x): x_enc = self.encode(x) x_ = F.pixel_shuffle(x_enc, self.scale) alpha = self.get_alpha() if hasattr(self, 'scope'): offset = F.pixel_unshuffle(self.offset(x_) * self.scope(x_).sigmoid(), self.scale) * (2 * alpha) + self.init_pos else: offset = F.pixel_unshuffle(self.offset(x_), self.scale) * alpha + self.init_pos return self.sample(x, offset) def forward(self, x): if self.style == 'pl': return self.forward_pl(x) return self.forward_lp(x)
实际用的时候,你可以先这样配:
encoder_kernel=3, encoder_dilation=2, compressed_channels=64直觉:
- 检测里的高 IoU 问题,多半是 FPN 顶层语义信息+边界信息没融合好;
- 用 3×3/dilated conv 做“内容编码器”,可以把更多局部上下文融进 offset,理论上更有利于边界附近的采样位置选择,从而提升 mAP75~95。
这是借鉴 FADE/SAPA 和 DLU 里“mask / gate” 的思路:
在 DySample 里加一个可选的 gate 分支:
pythonclass DySampleV2(nn.Module): def __init__(..., use_gate=False, gate_ratio=4, ...): ... self.use_gate = use_gate if use_gate: hidden = max(in_channels // gate_ratio, 8) self.gate = nn.Sequential( nn.Conv2d(in_channels, hidden, 1), nn.ReLU(inplace=True), nn.Conv2d(hidden, 1, 1) ) ... def forward(self, x): # 先算动态上采样 if self.style == 'pl': y_dyn = self.forward_pl(x) else: y_dyn = self.forward_lp(x) if self.use_gate: # 再算一个标准双线性上采样作为“稳定基线” y_bi = F.interpolate(x, scale_factor=self.scale, mode='bilinear', align_corners=False) g = torch.sigmoid(self.gate(x)) # shape: [B,1,H,W],再自动 broadcast # 残差方式融合:gate=0 → 纯 bilinear,gate=1 → 纯 DySample y = y_bi + g * (y_dyn - y_bi) return y return y_dyn
直观理解:
- 内部大面积平坦区域,网络可以学到 gate ≈ 0,多用稳定的 bilinear;
- 边界、纹理复杂区域,gate 会变大,更多地依赖 DySample 的动态采样;
- 这样对 高 IoU(精细定位) 更友好,因为不会在没必要的地方乱采样。
在 YOLO11 head 里,你可以只在 P4→P3 这一层打开 use_gate=True,顶层先不用 gate,这样:
如果你时间有限,我建议按这个顺序来做:
DySample + learnable_scope(方向 1)
style='lp', groups=4, dyscope=False, learnable_scope=True在 P5→P4 那一层换成 DySampleV2(方向 2),带内容编码器
encoder_kernel=3, encoder_dilation=2, compressed_channels=64, learnable_scope=True在 P4→P3 打开 gate(方向 3)
use_gate=True, gate_ratio=4如果你愿意,之后还可以再玩:
style='pl',只在高分辨率层(接近输入大小的那一层)用 PL-S 风格,其他层保持 LP;(Learning to Upsample by Learning to Sample.pdf)如果你把上面某个版本的完整 DySampleV2 代码和更新后的 yaml 再贴一次,我可以帮你逐行检查一下参数顺序和 shape,确保不会再出现那种很烦的构建期报错,然后你就可以专心看指标变化了。
给出每个方向的完整的代码,注意我要传递的参数只有head:
[[-1, 6], 1, Concat, [1]] # cat backbone P4
[-1, 2, C3k2, [512, False]] # 13
[-1, 1, DySample, [2, "lp"]]
OK,完全理解你的意思 👍:
DySample, [2, "lp"][c1, 2, "lp"],不再加多余参数DySample 类代码,DySample 类替换成哪个(一次只留一个)。下面 3 个方向都满足这个接口:
pythonclass DySample(nn.Module): def __init__(self, in_channels, scale=2, style='lp'): ...
也就是:
in_channels 由 parse_model 自动填scale=2、style="lp" 从 yaml 里来的如果你原来已经有 normal_init / constant_init,可以跳过这段。
pythonimport math import torch import torch.nn as nn import torch.nn.functional as F def normal_init(module, mean=0, std=1, bias=0): if hasattr(module, "weight") and module.weight is not None: nn.init.normal_(module.weight, mean, std) if hasattr(module, "bias") and module.bias is not None: nn.init.constant_(module.bias, bias) def constant_init(module, val, bias=0): if hasattr(module, "weight") and module.weight is not None: nn.init.constant_(module.weight, val) if hasattr(module, "bias") and module.bias is not None: nn.init.constant_(module.bias, bias)
特点:
style='lp'/'pl' 都支持groups 固定为 4,在类里面写死lp + groups=4 + dyscope=False,这个版本就是在此基础上“微调 α”pythonclass DySample(nn.Module): """ 版本1:LP/PL + learnable static scope - yaml 调用: DySample, [2, "lp"] - groups=4 固定 - 没有 scope 卷积,只有一个 learnable alpha (0,0.5) """ def __init__(self, in_channels, scale=2, style="lp"): super().__init__() self.scale = scale self.style = style self.groups = 4 # 固定 4 组 assert style in ["lp", "pl"] if style == "pl": assert in_channels >= scale**2 and in_channels % scale**2 == 0 assert in_channels >= self.groups and in_channels % self.groups == 0 if style == "pl": conv_in_channels = in_channels // (scale**2) out_channels = 2 * self.groups else: conv_in_channels = in_channels out_channels = 2 * self.groups * (scale**2) # offset 卷积 self.offset = nn.Conv2d(conv_in_channels, out_channels, 1) normal_init(self.offset, std=0.001) # learnable 的静态 scope 因子 alpha ∈ (0, 0.5) init_alpha = 0.25 logit = math.log(init_alpha / (0.5 - init_alpha)) self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float)) # 初始采样位置 self.register_buffer("init_pos", self._init_pos()) # 获取 alpha,范围 (0, 0.5) def _get_alpha(self): return 0.5 * torch.sigmoid(self.logit_alpha) def _init_pos(self): # [1, 2*groups*scale^2, 1, 1] h = torch.arange( (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1, dtype=torch.float32 ) # meshgrid,注意 indexing 参数避免 warning yy, xx = torch.meshgrid(h, h, indexing="ij") # (s, s) pos = torch.stack([xx, yy], dim=0) # (2, s, s) pos = pos.transpose(1, 2).repeat(1, self.groups, 1) # (2, s*groups, s) pos = pos.reshape(1, -1, 1, 1) # (1, 2*groups*s, 1, 1) return pos def sample(self, x, offset): B, _, H, W = offset.shape # [B, 2, groups*scale^2, H, W] offset = offset.view(B, 2, -1, H, W) coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5 coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5 # coords_w: width, coords_h: height grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij") # (W, H) coords = torch.stack([grid_w, grid_h], 
dim=0).transpose(1, 2) # (2, H, W) coords = coords.unsqueeze(0).unsqueeze(1) # (1,1,2,H,W) normalizer = torch.tensor( [W, H], dtype=x.dtype, device=x.device ).view(1, 2, 1, 1, 1) coords = 2 * (coords + offset) / normalizer - 1 # 归一化到 [-1,1] # 像原论文一样,用 pixel_shuffle 展开到高分辨率采样点 coords = F.pixel_shuffle( coords.view(B, -1, H, W), self.scale ) # (B, 2*groups, sH, sW) coords = ( coords.view(B, 2, -1, self.scale * H, self.scale * W) .permute(0, 2, 3, 4, 1) .contiguous() .flatten(0, 1) ) # (B*groups, sH, sW, 2) # 对每个 group 做 grid_sample return F.grid_sample( x.reshape(B * self.groups, -1, H, W), coords, mode="bilinear", align_corners=False, padding_mode="border", ).view(B, -1, self.scale * H, self.scale * W) def forward_lp(self, x): alpha = self._get_alpha() offset = self.offset(x) * alpha + self.init_pos.to( x.dtype ) # broadcast 到 [B, C, H, W] return self.sample(x, offset) def forward_pl(self, x): x_ = F.pixel_shuffle(x, self.scale) alpha = self._get_alpha() offset = ( F.pixel_unshuffle(self.offset(x_), self.scale) * alpha + self.init_pos.to(x.dtype) ) return self.sample(x, offset) def forward(self, x): if self.style == "pl": return self.forward_pl(x) return self.forward_lp(x)
特点:
pythonclass DySample(nn.Module): """ 版本2:版本1 + DLU 风格内容编码器 - yaml 调用: DySample, [2, "lp"] - groups=4 固定 - learnable static scope alpha - 在 offset 之前加: 1x1 通道压缩 + 3x3 编码 """ def __init__(self, in_channels, scale=2, style="lp"): super().__init__() self.scale = scale self.style = style self.groups = 4 assert style in ["lp", "pl"] if style == "pl": assert in_channels >= scale**2 and in_channels % scale**2 == 0 assert in_channels >= self.groups and in_channels % self.groups == 0 # ---- DLU 风格内容编码器:compress + 3x3 encoder ---- mid_channels = max(in_channels // 2, 16) self.compress = nn.Conv2d(in_channels, mid_channels, 1) self.encoder = nn.Conv2d( mid_channels, in_channels, kernel_size=3, padding=1, dilation=1, bias=True, ) # offset 分支输入通道数(可能是 pl 风格下的 C/s^2) if style == "pl": conv_in_channels = in_channels // (scale**2) out_channels = 2 * self.groups else: conv_in_channels = in_channels out_channels = 2 * self.groups * (scale**2) self.offset = nn.Conv2d(conv_in_channels, out_channels, 1) normal_init(self.offset, std=0.001) # learnable alpha init_alpha = 0.25 logit = math.log(init_alpha / (0.5 - init_alpha)) self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float)) self.register_buffer("init_pos", self._init_pos()) def _get_alpha(self): return 0.5 * torch.sigmoid(self.logit_alpha) def _init_pos(self): h = torch.arange( (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1, dtype=torch.float32 ) yy, xx = torch.meshgrid(h, h, indexing="ij") pos = torch.stack([xx, yy], dim=0) pos = pos.transpose(1, 2).repeat(1, self.groups, 1) pos = pos.reshape(1, -1, 1, 1) return pos def sample(self, x, offset): B, _, H, W = offset.shape offset = offset.view(B, 2, -1, H, W) coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5 coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5 grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij") coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2) coords = coords.unsqueeze(0).unsqueeze(1) normalizer = 
torch.tensor( [W, H], dtype=x.dtype, device=x.device ).view(1, 2, 1, 1, 1) coords = 2 * (coords + offset) / normalizer - 1 coords = F.pixel_shuffle( coords.view(B, -1, H, W), self.scale ) # (B, 2*groups, sH, sW) coords = ( coords.view(B, 2, -1, self.scale * H, self.scale * W) .permute(0, 2, 3, 4, 1) .contiguous() .flatten(0, 1) ) return F.grid_sample( x.reshape(B * self.groups, -1, H, W), coords, mode="bilinear", align_corners=False, padding_mode="border", ).view(B, -1, self.scale * H, self.scale * W) def encode(self, x): x1 = self.compress(x) x1 = F.relu(x1, inplace=True) x1 = self.encoder(x1) return x1 def _forward_lp_core(self, x): x_enc = self.encode(x) alpha = self._get_alpha() offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype) return self.sample(x, offset) def _forward_pl_core(self, x): x_enc = self.encode(x) x_ = F.pixel_shuffle(x_enc, self.scale) alpha = self._get_alpha() offset = ( F.pixel_unshuffle(self.offset(x_), self.scale) * alpha + self.init_pos.to(x.dtype) ) return self.sample(x, offset) def forward(self, x): if self.style == "pl": return self._forward_pl_core(x) return self._forward_lp_core(x)
特点:
x,输出一个 [B,1,H,W] 的图,再插值到 [B,1,sH,sW]y = y_bilinear + g * (y_dynamic - y_bilinear)
pythonclass DySample(nn.Module): """ 版本3:版本2 + gate(在 bilinear 与 dynamic 之间做自适应融合) - yaml 调用: DySample, [2, "lp"] - groups=4 固定 - learnable static scope alpha - DLU 风格内容编码器 - gate: y = y_bi + g * (y_dyn - y_bi) """ def __init__(self, in_channels, scale=2, style="lp"): super().__init__() self.scale = scale self.style = style self.groups = 4 assert style in ["lp", "pl"] if style == "pl": assert in_channels >= scale**2 and in_channels % scale**2 == 0 assert in_channels >= self.groups and in_channels % self.groups == 0 # ---- DLU 风格内容编码器 ---- mid_channels = max(in_channels // 2, 16) self.compress = nn.Conv2d(in_channels, mid_channels, 1) self.encoder = nn.Conv2d( mid_channels, in_channels, kernel_size=3, padding=1, dilation=1, bias=True, ) # offset 分支 if style == "pl": conv_in_channels = in_channels // (scale**2) out_channels = 2 * self.groups else: conv_in_channels = in_channels out_channels = 2 * self.groups * (scale**2) self.offset = nn.Conv2d(conv_in_channels, out_channels, 1) normal_init(self.offset, std=0.001) # learnable alpha init_alpha = 0.25 logit = math.log(init_alpha / (0.5 - init_alpha)) self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float)) # gate 分支:从 x 生成 [B,1,H,W] gate hidden = max(in_channels // 4, 8) self.gate_conv1 = nn.Conv2d(in_channels, hidden, 1) self.gate_conv2 = nn.Conv2d(hidden, 1, 1) self.register_buffer("init_pos", self._init_pos()) def _get_alpha(self): return 0.5 * torch.sigmoid(self.logit_alpha) def _init_pos(self): h = torch.arange( (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1, dtype=torch.float32 ) yy, xx = torch.meshgrid(h, h, indexing="ij") pos = torch.stack([xx, yy], dim=0) pos = pos.transpose(1, 2).repeat(1, self.groups, 1) pos = pos.reshape(1, -1, 1, 1) return pos def sample(self, x, offset): B, _, H, W = offset.shape offset = offset.view(B, 2, -1, H, W) coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5 coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5 grid_w, grid_h = 
torch.meshgrid(coords_w, coords_h, indexing="ij") coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2) coords = coords.unsqueeze(0).unsqueeze(1) normalizer = torch.tensor( [W, H], dtype=x.dtype, device=x.device ).view(1, 2, 1, 1, 1) coords = 2 * (coords + offset) / normalizer - 1 coords = F.pixel_shuffle( coords.view(B, -1, H, W), self.scale ) # (B, 2*groups, sH, sW) coords = ( coords.view(B, 2, -1, self.scale * H, self.scale * W) .permute(0, 2, 3, 4, 1) .contiguous() .flatten(0, 1) ) return F.grid_sample( x.reshape(B * self.groups, -1, H, W), coords, mode="bilinear", align_corners=False, padding_mode="border", ).view(B, -1, self.scale * H, self.scale * W) def encode(self, x): x1 = self.compress(x) x1 = F.relu(x1, inplace=True) x1 = self.encoder(x1) return x1 # 动态分支核心(不含 gate) def _forward_lp_core(self, x): x_enc = self.encode(x) alpha = self._get_alpha() offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype) return self.sample(x, offset) def _forward_pl_core(self, x): x_enc = self.encode(x) x_ = F.pixel_shuffle(x_enc, self.scale) alpha = self._get_alpha() offset = ( F.pixel_unshuffle(self.offset(x_), self.scale) * alpha + self.init_pos.to(x.dtype) ) return self.sample(x, offset) def forward(self, x): # 先算动态上采样 if self.style == "pl": y_dyn = self._forward_pl_core(x) else: y_dyn = self._forward_lp_core(x) # 再算一个普通 bilinear 作为基线 y_bi = F.interpolate( x, scale_factor=self.scale, mode="bilinear", align_corners=False ) # gate from x -> [B,1,H,W] -> upsample 到 [B,1,sH,sW] g = self.gate_conv1(x) g = F.relu(g, inplace=True) g = self.gate_conv2(g) g = torch.sigmoid(g) # [B,1,H,W] g_up = F.interpolate( g, scale_factor=self.scale, mode="bilinear", align_corners=False ) # [B,1,sH,sW] # 残差融合:gate=0 时等于 bilinear,gate=1 时等于 DySample y = y_bi + g_up * (y_dyn - y_bi) return y
yaml 保持不动(就你发的那段):
yamlhead: - [-1, 1, DySample, [2, "lp"]] - [[-1, 6], 1, Concat, [1]] - [-1, 2, C3k2, [512, False]] - [-1, 1, DySample, [2, "lp"]] - [[-1, 4], 1, Concat, [1]] - [-1, 2, C3k2, [256, False]]
parse_model 里 DySample 分支可以保持你原来的简单写法(只要不放进 base_modules 就行):
pythonelif m is DySample: c1 = ch[f] c2 = c1 scale = args[0] if len(args) > 0 else 2 style = args[1] if len(args) > 1 else "lp" args = [c1, scale, style]
三个版本里 同一时间只能保留一个 class DySample,
换实验就把对应的版本覆盖掉之前的。
你可以先从 版本1 开始,因为它离你目前“最优结果”的结构最近,改动最小;
如果 mAP50-95 有提升,再试版本2、3 看看能不能继续往上抬。
如果你跑完某个版本的日志(特别是每层 DySample 的 in_channels、shape 有输出)想一起检查,我们可以再把 forward 里的 debug print 打开,一起对一下有没有哪里 shape 不对。
版本1,2,3都有报错如下的情况,如何解决呢:Traceback (most recent call last):
File "train.py", line 17, in <module>
model = YOLO(model='ultralytics/cfg/models/11/yolo11n.yaml')
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/models/yolo/model.py", line 53, in init
super().init(model=model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 146, in init
self._new(model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 258, in _new
self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 359, in init
m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))]) # forward
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 357, in _forward
return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 122, in forward
return self.predict(x, *args, **kwargs)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 140, in predict
return self._predict_once(x, profile, visualize, embed)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 161, in _predict_once
x = m(x) # run
File "/home/jd/anaconda3/envs/Yolov11/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/jd/anaconda3/envs/Yolov11/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/modules/block.py", line 203, in forward
y_dyn = self._forward_lp_core(x)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/modules/block.py", line 186, in _forward_lp_core
return self.sample(x, offset)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/modules/block.py", line 155, in sample
coords = 2 * (coords + offset) / normalizer - 1
RuntimeError: The size of tensor a (2) must match the size of tensor b (16) at non-singleton dimension 2
(Yolov11) jd@jd-x11dai-n:/media/jd/4997BB1603CFE2C4/lw/ultralytic$ python train.py
Traceback (most recent call last):
File "train.py", line 17, in <module>
model = YOLO(model='ultralytics/cfg/models/11/yolo11n.yaml')
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/models/yolo/model.py", line 53, in init
super().init(model=model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 146, in init
self._new(model, task=task, verbose=verbose)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/engine/model.py", line 258, in _new
self.model = (model or self._smart_load("model"))(cfg_dict, verbose=verbose and RANK == -1) # build model
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 359, in init
m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))]) # forward
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 357, in _forward
return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 122, in forward
return self.predict(x, *args, **kwargs)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 140, in predict
return self._predict_once(x, profile, visualize, embed)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/tasks.py", line 161, in _predict_once
x = m(x) # run
File "/home/jd/anaconda3/envs/Yolov11/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/jd/anaconda3/envs/Yolov11/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/modules/block.py", line 184, in forward
return self.forward_lp(x)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/modules/block.py", line 170, in forward_lp
return self.sample(x, offset)
File "/media/jd/4997BB1603CFE2C4/lw/ultralytic/ultralytics/nn/modules/block.py", line 143, in sample
coords = 2 * (coords + offset) / normalizer - 1 # 归一化到 [-1,1]
RuntimeError: The size of tensor a (2) must match the size of tensor b (16) at non-singleton dimension 2
这个报错其实很精准地在骂我 😅:
The size of tensor a (2) must match the size of tensor b (16) at non-singleton dimension 2
意思就是:在
pythoncoords = 2 * (coords + offset) / normalizer - 1
这里,coords 和 offset 在 第 2 维(下标 2)一个是 2,一个是 16,不能广播。
根本原因:
我在给你写新版本的时候,把原论文实现里 unsqueeze 的顺序改错了,导致 coords 形状从原来的 (1, 2, 1, H, W) 变成了 (1, 1, 2, H, W),和 offset (B, 2, 16, H, W) 对不上。
解决办法:直接把 _init_pos 和 sample() 完全恢复成论文原版那一套,错误就没了。
我们只在 offset 生成那块做改动,不动 meshgrid 和 reshape。
下面我给你 三个版本的 DySample 完整代码,都满足:
构造函数只有: __init__(self, in_channels, scale=2, style='lp')
yaml 保持不变:
yamlhead: - [-1, 1, DySample, [2, "lp"]] - [[-1, 6], 1, Concat, [1]] - [-1, 2, C3k2, [512, False]] - [-1, 1, DySample, [2, "lp"]] - [[-1, 4], 1, Concat, [1]] - [-1, 2, C3k2, [256, False]]
parse_model 里 DySample 分支写成:
pythonelif m is DySample: c1 = ch[f] c2 = c1 scale = args[0] if len(args) > 0 else 2 style = args[1] if len(args) > 1 else "lp" args = [c1, scale, style]
_init_pos 和 sample() 完全照论文原始实现(只多加了 dtype / device 处理,不改维度顺序),这样就不会再出现那个 2 vs 16 的形状冲突。
一次只能保留一个
class DySample,你要试版本 1/2/3 时,把原来的DySample整个删掉,换成下面对应的那一段。
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


def normal_init(module, mean=0, std=1, bias=0):
    """Initialise *module*'s weight from N(mean, std) and set its bias to a constant.

    Silently does nothing for a weight/bias attribute that is missing or None,
    so it is safe to call on any nn.Module.
    """
    weight = getattr(module, "weight", None)
    if weight is not None:
        nn.init.normal_(weight, mean, std)
    bias_param = getattr(module, "bias", None)
    if bias_param is not None:
        nn.init.constant_(bias_param, bias)


def constant_init(module, val, bias=0):
    """Fill *module*'s weight with *val* and its bias with *bias*.

    Missing or None weight/bias attributes are skipped, mirroring normal_init.
    """
    weight = getattr(module, "weight", None)
    if weight is not None:
        nn.init.constant_(weight, val)
    bias_param = getattr(module, "bias", None)
    if bias_param is not None:
        nn.init.constant_(bias_param, bias)
对应论文里 LP/PL + static scope factor,原来写死 0.25,现在变成可学习 α∈(0,0.5)。
其它逻辑完全跟你之前效果最好的版本一样,groups=4,没有动态 scope 卷积。
class DySample(nn.Module):
    """
    Version 1: DySample (LP/PL styles) + learnable static scope (no scope conv).
    - YOLO yaml usage: DySample, [2, "lp"]
    - groups = 4 is hard-coded inside the class
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4
        assert style in ["lp", "pl"]
        if style == "pl":
            # PL style needs the channel count divisible by scale^2
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)
        # offset-generating 1x1 convolution
        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)
        # learnable static scope factor alpha in (0, 0.5), initialised at 0.25
        # (stored as a logit so that sigmoid keeps it inside the open interval)
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))
        # initial sampling positions (kept exactly as the original implementation)
        self.register_buffer("init_pos", self._init_pos())

    # ===== the two functions below keep the original shape logic unchanged =====
    def _init_pos(self):
        # h: [-(s-1)/2, ..., (s-1)/2] / s
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale  # (s,)
        # torch.meshgrid([h, h]) -> two (s, s) grids, stacked to (2, s, s)
        pos = torch.stack(torch.meshgrid([h, h])).transpose(1, 2)  # (2, s, s)
        pos = pos.repeat(1, self.groups, 1)  # (2, s*groups, s)
        pos = pos.reshape(1, -1, 1, 1)  # (1, 2*groups*s^2, 1, 1)
        return pos

    def sample(self, x, offset):
        """
        x: [B, C, H, W]
        offset: [B, 2*groups*s^2, H, W]
        """
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)  # [B, 2, groups*s^2, H, W]
        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        # original ordering -- do NOT change the unsqueeze order!
        coords = torch.stack(torch.meshgrid([coords_w, coords_h]))  # (2, W, H)
        coords = (
            coords.transpose(1, 2)  # (2, H, W)
            .unsqueeze(1)  # (2, 1, H, W)
            .unsqueeze(0)  # (1, 2, 1, H, W)
            .to(device=x.device, dtype=x.dtype)
        )
        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)
        # the earlier crash came from a wrongly-shaped coords; this restores
        # the original broadcast: [1,2,1,H,W] against [B,2,groups*s^2,H,W]
        coords = 2 * (coords + offset) / normalizer - 1  # [B, 2, groups*s^2, H, W]
        # expand to the high-resolution sampling grid
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )  # [B*groups, sH, sW, 2]
        # grid_sample independently per group
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)
    # =====================================================

    def _get_alpha(self):
        # alpha in (0, 0.5)
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def forward_lp(self, x):
        alpha = self._get_alpha()
        # conv_in_channels == in_channels, so self.offset(x) applies directly
        offset = self.offset(x) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def forward_pl(self, x):
        # PL style: pixel-shuffle to high resolution first, run the conv there,
        # then pixel-unshuffle the result back to a low-resolution offset map
        x_ = F.pixel_shuffle(x, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        # dispatch on the configured upsampling style
        if self.style == "pl":
            return self.forward_pl(x)
        return self.forward_lp(x)
在生成 offset 前,先做
1×1 通道压缩 + 3×3 编码,更关注局部结构。
其它接口完全一样,也没有 gate,没有动态 scope。
class DySample(nn.Module):
    """
    Version 2: version 1 + a DLU-style content encoder.
    - YOLO yaml usage: DySample, [2, "lp"]
    - groups fixed to 4
    - learnable static scope alpha
    - 1x1 compress + 3x3 encoder in front of the offset conv
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4
        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # DLU-style content encoder: 1x1 compression + 3x3 encoding
        mid_channels = max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid_channels, 1)
        self.encoder = nn.Conv2d(
            mid_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            dilation=1,
            bias=True,
        )

        # input/output channels of the offset conv depend on the style
        if style == "pl":
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable alpha in (0, 0.5), initialised at 0.25
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        self.register_buffer("init_pos", self._init_pos())

    # === original _init_pos and sample (kept untouched) ===
    def _init_pos(self):
        # evenly spaced sub-pixel offsets, repeated once per group
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale
        pos = torch.stack(torch.meshgrid([h, h])).transpose(1, 2)
        pos = pos.repeat(1, self.groups, 1).reshape(1, -1, 1, 1)
        return pos

    def sample(self, x, offset):
        # x: [B, C, H, W]; offset: [B, 2*groups*scale^2, H, W]
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)
        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        coords = torch.stack(torch.meshgrid([coords_w, coords_h]))
        coords = (
            coords.transpose(1, 2)
            .unsqueeze(1)
            .unsqueeze(0)
            .to(device=x.device, dtype=x.dtype)
        )
        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)
        # normalise absolute positions to grid_sample's [-1, 1] range
        coords = 2 * (coords + offset) / normalizer - 1
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)
    # ==========================================

    def _get_alpha(self):
        # alpha in (0, 0.5)
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def encode(self, x):
        # content encoding: compress -> ReLU -> 3x3 encoder
        x1 = self.compress(x)
        x1 = F.relu(x1, inplace=True)
        x1 = self.encoder(x1)
        return x1

    def _forward_lp_core(self, x):
        # LP style: offsets predicted on the low-resolution encoded features
        x_enc = self.encode(x)
        alpha = self._get_alpha()
        offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def _forward_pl_core(self, x):
        # PL style: pixel-shuffle the encoded features, predict offsets at
        # high resolution, then pixel-unshuffle back to the low-res offset map
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self._forward_pl_core(x)
        return self._forward_lp_core(x)
输出:
y = y_bilinear + gate * (y_dynamic - y_bilinear)
gate 从低分辨率 x 上算出来,插值到高分辨率。平坦区域更偏向 bilinear,边缘区域更偏向动态采样。
class DySample(nn.Module):
    """
    Version 3: version 2 + gate (adaptive fusion of bilinear vs dynamic).
    - YOLO yaml usage: DySample, [2, "lp"]
    - groups fixed to 4
    - learnable static scope alpha
    - DLU-style content encoder
    - gate: y = y_bi + g * (y_dyn - y_bi)
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4
        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # content encoder (1x1 compress + 3x3 encode)
        mid_channels = max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid_channels, 1)
        self.encoder = nn.Conv2d(
            mid_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            dilation=1,
            bias=True,
        )

        # offset conv
        if style == "pl":
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable alpha in (0, 0.5), initialised at 0.25
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # gate branch: two 1x1 convs producing a single-channel gate map
        hidden = max(in_channels // 4, 8)
        self.gate_conv1 = nn.Conv2d(in_channels, hidden, 1)
        self.gate_conv2 = nn.Conv2d(hidden, 1, 1)

        self.register_buffer("init_pos", self._init_pos())

    # === original _init_pos and sample ===
    def _init_pos(self):
        # evenly spaced sub-pixel offsets, repeated once per group
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale
        pos = torch.stack(torch.meshgrid([h, h])).transpose(1, 2)
        pos = pos.repeat(1, self.groups, 1).reshape(1, -1, 1, 1)
        return pos

    def sample(self, x, offset):
        # x: [B, C, H, W]; offset: [B, 2*groups*scale^2, H, W]
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)
        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        coords = torch.stack(torch.meshgrid([coords_w, coords_h]))
        coords = (
            coords.transpose(1, 2)
            .unsqueeze(1)
            .unsqueeze(0)
            .to(device=x.device,
                dtype=x.dtype)
        )
        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)
        # normalise absolute positions to grid_sample's [-1, 1] range
        coords = 2 * (coords + offset) / normalizer - 1
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)
    # ==============================

    def _get_alpha(self):
        # alpha in (0, 0.5)
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def encode(self, x):
        # content encoding: compress -> ReLU -> 3x3 encoder
        x1 = self.compress(x)
        x1 = F.relu(x1, inplace=True)
        x1 = self.encoder(x1)
        return x1

    def _forward_lp_core(self, x):
        # LP style: offsets predicted on the low-resolution encoded features
        x_enc = self.encode(x)
        alpha = self._get_alpha()
        offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def _forward_pl_core(self, x):
        # PL style: pixel-shuffle the encoded features, predict offsets at
        # high resolution, then pixel-unshuffle back to a low-res offset map
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        # dynamic branch
        if self.style == "pl":
            y_dyn = self._forward_pl_core(x)
        else:
            y_dyn = self._forward_lp_core(x)
        # bilinear baseline
        y_bi = F.interpolate(
            x, scale_factor=self.scale, mode="bilinear", align_corners=False
        )
        # gate from x: [B,1,H,W] -> upsampled to [B,1,sH,sW]
        g = self.gate_conv1(x)
        g = F.relu(g, inplace=True)
        g = self.gate_conv2(g)
        g = torch.sigmoid(g)  # [B,1,H,W]
        g_up = F.interpolate(
            g, scale_factor=self.scale, mode="bilinear", align_corners=False
        )  # [B,1,sH,sW]
        # residual fusion: g=0 -> pure bilinear, g=1 -> pure DySample
        y = y_bi + g_up * (y_dyn - y_bi)
        return y
lp + groups=4 + dyscope=False 有没有一点提升。如果你换了某个版本之后又出现新的错误(比如 shape 不一致),把那一版的 DySample 全部代码 + 报错栈贴出来,我可以帮你把维度再对一遍(但只要 _init_pos 和 sample 按上面写,基本不会再出现 “2 vs 16” 这种问题了)。
grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij") # (W, H)
coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2) # (2, H, W)
# 关键修改:让 coords 变成 [1, 2, 1, H, W],方便和 [B, 2, G, H, W] 广播
coords = coords.unsqueeze(0).unsqueeze(2)  # (1, 2, 1, H, W)

这和你给的修改方案有什么区别呢?结果一样吗?
是一样的 ✅,你这个写法和我给你的那版,在形状和数值上都等价,不会影响结果。
咱们仔细对比一下这两种写法做了什么:
pythongrid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij") # (W, H) coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2) # (2, H, W) # 关键:变成 [1, 2, 1, H, W] coords = coords.unsqueeze(0).unsqueeze(2) # (1, 2, 1, H, W)
维度变化过程:
grid_w, grid_h: 形状都是 (W, H);stack(..., dim=0) → (2, W, H);transpose(1, 2) → (2, H, W);unsqueeze(0) → (1, 2, H, W);unsqueeze(2) → (1, 2, 1, H, W)。最后是 (1, 2, 1, H, W) ✅
pythoncoords = torch.stack(torch.meshgrid([coords_w, coords_h])) # (2, W, H) coords = coords.transpose(1, 2) # (2, H, W) coords = coords.unsqueeze(1).unsqueeze(0) # (1, 2, 1, H, W)
维度变化过程:
stack(meshgrid(...)) → (2, W, H);transpose(1, 2) → (2, H, W);unsqueeze(1) → (2, 1, H, W);unsqueeze(0) → (1, 2, 1, H, W)。最后也是 (1, 2, 1, H, W) ✅
(1, 2, 1, H, W),正好可以和 offset 的 (B, 2, G, H, W) 在第 0 和第 2 维广播:
1 vs B → 广播;2 vs 2 → 匹配;1 vs G → 广播;H vs H → 匹配;W vs W → 匹配。unsqueeze(0).unsqueeze(2) 和 unsqueeze(1).unsqueeze(0) 只是“从不同顺序插入维度”,但最后的维度序列都是 [1, 2, 1, H, W],数据本身没变。meshgrid(..., indexing="ij") 只是修掉 PyTorch 的 warning,这个和我不用 indexing、直接默认 'ij' 在行为上是一致的(前提是维度顺序一样)。所以,如果你现在这一版:
pythongrid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij") coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2) coords = coords.unsqueeze(0).unsqueeze(2)
已经不再报之前那个:
textRuntimeError: The size of tensor a (2) must match the size of tensor b (16) at non-singleton dimension 2
那就可以放心用,和我给你的那版在数学上是一样的,不会影响 DySample 的行为。
如果你之后还想加别的改动(比如 gate 或 encoder),只要保证 coords 最终是 [1, 2, 1, H, W],offset 是 [B, 2, G, H, W],就不会再遇到这个维度冲突问题了。
原始动态上采样lp和False的训练结果,p,r,map50,map50-95如下:0.947, 0.882,0.901,0.72,按照方案2的结果最差,不展示,方案1结果如下:0.94,0.83,0.909,0.725,方案3结果如下:0.961,0.836,0.898,0.721,我现在应该选择哪个方案继续修改呢
结论先说在前面:后续主线建议基于「方案 1」继续改,方案 3 保留当对比 / 做 P–R trade-off 用就好。
你现在的几个结果(我给它们起个名字方便说):
🟢 原版 DySample(lp, groups=4, dyscope=False)
🔵 方案 1(static α 可学习)
🟥 方案 3(encoder + gate)
可以看到:
在 COCO 等检测任务里,评价最终看的是 mAP50–95 为主,P/R 更像是“侧面性格”。
从这个角度看:
结合 DySample 论文本身的设计和消融实验:
你现在的方案 1 做的事情是:
0.25 外面加了一个 可学习 α,让网络自己在 ~0.25 附近微调从实验结果来看:
相比之下:
所以我的建议是:
✅ 继续基于「方案 1」做细化和小改进,把它当作主干版本。
⭕ 方案 3 可以保留用来研究 “高 P / 低 R” 风格,或者以后做 ensemble / 测试不同任务时再启用。
你后面如果要继续改方案 1,可以重点围绕几块玩——都在 不改 yaml 接口 的前提下:
约束 α 不要跑太远
(0, 0.5),可以:
对 (α - 0.25)^2 加一个很小的正则,或把 α 限制在 (0.15, 0.35),让它只在理论合适区间内细调;只对某一层 DySample 用 encoder / gate 做局部增强(比如只在高层特征上用)
针对召回下降的问题,微调 NMS / conf 阈值
如果你接下来想继续改代码,我们可以在 方案 1 的实现上,一点点加这些小改动,而不动 yaml。
一句话总结:
按照你说的方案给出可执行的代码
好,那我们就把“方案 1”(可学习静态 scope 的 DySample)整理成一套能直接跑的代码给你 👍
你只需要改两块:
DySample 类(比如放在 ultralytics/nn/modules/block.py 里)parse_model 里 DySample 的那段解析逻辑yaml 保持不变:
yamlhead: - [-1, 1, DySample, [2, "lp"]] - [[-1, 6], 1, Concat, [1]] - [-1, 2, C3k2, [512, False]] - [-1, 1, DySample, [2, "lp"]] - [[-1, 4], 1, Concat, [1]] - [-1, 2, C3k2, [256, False]]
放在你原来定义 DySample 的地方(比如
ultralytics/nn/modules/block.py),
保证工程里同名类DySample只有这一个版本。
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


def normal_init(module, mean=0, std=1, bias=0):
    """Initialise module.weight from N(mean, std) and module.bias to a constant (skips missing attrs)."""
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.normal_(module.weight, mean, std)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


def constant_init(module, val, bias=0):
    """Fill module.weight with val and module.bias with bias (skips missing attrs)."""
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.constant_(module.weight, val)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


class DySample(nn.Module):
    """
    Scheme 1: DySample (LP/PL) with a learnable static scope (no scope conv).
    - YOLO yaml usage: DySample, [2, "lp"]
    - groups fixed to 4
    - alpha in (0, 0.5), learnable, initialised at 0.25
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4  # fixed to 4 groups
        assert style in ["lp", "pl"]
        if style == "pl":
            # PL style requires in_channels to be divisible by scale^2
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
        # the group split must also divide the channels
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # in/out channels of the offset conv depend on the style
        if style == "pl":
            # PL: conv input = C / s^2, output = 2 * groups
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            # LP: conv input = C, output = 2 * groups * s^2
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        # offset-generating 1x1 convolution
        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable static scope factor alpha in (0, 0.5), init 0.25
        # (inverse-sigmoid parameterisation keeps it in the open interval)
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # initial sampling positions (reference DySample implementation)
        self.register_buffer("init_pos", self._init_pos())

    # ------------ the two functions below keep the official shape logic ------------
    def _init_pos(self):
        """Return init_pos: [1, 2 * groups * scale^2, 1, 1]."""
        # h: [-(s-1)/2, ..., (s-1)/2] / s
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale  # (s,)
        # meshgrid with indexing="ij" to silence the PyTorch warning
        yy, xx = torch.meshgrid(h, h, indexing="ij")  # each (s, s)
        # BUGFIX: stack (yy, xx), not (xx, yy). The reference implementation is
        # torch.stack(torch.meshgrid([h, h])).transpose(1, 2), whose pre-transpose
        # channel order is (yy, xx); stacking (xx, yy) here swapped the x/y
        # initial offsets and transposed the sub-pixel layout inside each cell.
        pos = torch.stack([yy, xx], dim=0)  # (2, s, s)
        pos = pos.transpose(1, 2).repeat(1, self.groups, 1)  # (2, s*groups, s)
        pos = pos.reshape(1, -1, 1, 1)  # (1, 2 * groups * s^2, 1, 1)
        return pos

    def sample(self, x, offset):
        """
        x: [B, C, H, W]
        offset: [B, 2*groups*scale^2, H, W]
        """
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)  # [B, 2, G, H, W], G = groups * scale^2

        # base pixel-centre coordinates, shaped [1, 2, 1, H, W]
        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")  # (W, H)
        coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)  # (2, H, W)
        coords = coords.unsqueeze(0).unsqueeze(2)  # (1, 2, 1, H, W)

        # normalisation constants for grid_sample's [-1, 1] coordinate range
        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)

        # broadcast add: result is [B, 2, G, H, W]
        coords = 2 * (coords + offset) / normalizer - 1

        # expand to the high-resolution sampling grid:
        # view [B, 2*G, H, W] -> pixel_shuffle(r=scale) -> [B, 2*groups, sH, sW]
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )  # [B*groups, sH, sW, 2]

        # grid_sample independently per group
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)
    # --------------------------------------------------

    def _get_alpha(self):
        """alpha in (0, 0.5)."""
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def forward_lp(self, x):
        """LP style: offset = Conv(x), computed directly on the low-res input."""
        alpha = self._get_alpha()
        offset = self.offset(x) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def forward_pl(self, x):
        """PL style: pixel_shuffle to high res, conv, pixel_unshuffle back to a low-res offset."""
        x_ = F.pixel_shuffle(x, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self.forward_pl(x)
        return self.forward_lp(x)
parse_model 里 DySample 的解析代码关键点有两个:
base_modules,否则会走通用卷积那套逻辑,导致 c2 / args 搞错。[in_channels, scale, style] 这三个参数。base_modules 里删掉 DySample你原来是这样的(节选):
pythonbase_modules = frozenset( { Classify, Conv, ... LSKBlock, DySample, # ❌ 不要放这里 MSCA, } )
改成:
pythonbase_modules = frozenset( { Classify, Conv, CBAM, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, C2fPSA, C2PSA, DWConv, Focus, BottleneckCSP, C1, C2, C2f, C3k2, RepNCSPELAN4, ELAN1, ADown, AConv, SPPELAN, C2fAttn, C3, C3TR, C3Ghost, torch.nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3, PSA, SCDown, C2fCIB, A2C2f, LSKBlock, MSCA, # ⬆️ 这里不要写 DySample } )
在你原来的 for i, (f, n, m, args) in enumerate(...): 大循环里,
留意你之前的结构,大概是这样(节选):
pythonfor i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]): m = ... ... if m in base_modules: ... elif m is AIFI: ... ... elif m in frozenset({TorchVision, Index}): ... elif m is DySample: c2 = ch[f] args = [ch[f], *args] else: c2 = ch[f]
我们把 DySample 分支改成带 scale/style 解析的版本:
python... elif m in frozenset({TorchVision, Index}): c2 = args[0] c1 = ch[f] args = [*args[1:]] elif m is DySample: # DySample 不改变通道数:c2 = c1 c1 = ch[f] c2 = c1 # yaml 给的是 [2, "lp"],只解析这两个参数 scale = args[0] if len(args) > 0 else 2 style = args[1] if len(args) > 1 else "lp" # DySample(in_channels, scale, style) args = [c1, scale, style] else: c2 = ch[f]
这样:
[2, "lp"] 会被扩充成 [in_channels, 2, "lp"]DySample.__init__(self, in_channels, scale=2, style='lp') 完全对应c2 会被正确记成 ch[f],后面 Concat/Detect 都能正常推断通道数DySample 替换成上面“方案 1”的这个版本。parse_model 里:
base_modules 没有 DySample;elif m is DySample: 的分支。这样启动 python train.py 时:
2 vs 16 的 shape 报错;lp, groups=4, dyscope=False 的基础上,mAP50 和 mAP50–95 都有小提升。如果你按这个版本跑完,又想在这个基础上加小改动(比如约束 α 的范围、只给某一层上 encoder/gate),可以把新的报错或结果再贴出来,我们可以继续在这个“方案 1”版本上迭代。