这是我阅读的一个关于动态上采样的文章，这个文章里介绍了动态上采样的方法和操作细节，这两个动态上采样哪...

Question

这是我阅读的一个关于动态上采样的文章，这个文章里介绍了动态上采样的方法和操作细节，这两个动态上采样哪个参数量小和效果好呢，哪个模型更新呢，我想将这个动态上采样插入在Ultralytics yolov11的模型里来替换原始的yolov11的双线性上采样，理解吗，我尝试了DySample的静态和动态两个模型，发现静态对指标map50-95是下降，但是其他指标都是上升，他的代码如下：def normal_init(module, mean=0, std=1, bias=0):
if hasattr(module, 'weight') and module.weight is not None:
nn.init.normal_(module.weight, mean, std)
if hasattr(module, 'bias') and module.bias is not None:
nn.init.constant_(module.bias, bias)

def constant_init(module, val, bias=0):
if hasattr(module, 'weight') and module.weight is not None:
nn.init.constant_(module.weight, val)
if hasattr(module, 'bias') and module.bias is not None:
nn.init.constant_(module.bias, bias)

class DySample(nn.Module):
def init(self, in_channels, scale=2, style='lp', groups=4, dyscope=False):
super().init()
self.scale = scale
self.style = style
self.groups = groups
assert style in ['lp', 'pl']
if style == 'pl':
assert in_channels >= scale ** 2 and in_channels % scale ** 2 == 0
assert in_channels >= groups and in_channels % groups == 0

text
    if style == 'pl':
        in_channels = in_channels // scale ** 2
        out_channels = 2 * groups
    else:
        out_channels = 2 * groups * scale ** 2

    self.offset = nn.Conv2d(in_channels, out_channels, 1)
    normal_init(self.offset, std=0.001)
    if dyscope:
        self.scope = nn.Conv2d(in_channels, out_channels, 1, bias=False)
        constant_init(self.scope, val=0.)

    self.register_buffer('init_pos', self._init_pos())

def _init_pos(self):
    h = torch.arange((-self.scale + 1) / 2, (self.scale - 1) / 2 + 1) / self.scale
    return torch.stack(torch.meshgrid([h, h])).transpose(1, 2).repeat(1, self.groups, 1).reshape(1, -1, 1, 1)

def sample(self, x, offset):
    B, _, H, W = offset.shape
    offset = offset.view(B, 2, -1, H, W)
    coords_h = torch.arange(H) + 0.5
    coords_w = torch.arange(W) + 0.5
    coords = torch.stack(torch.meshgrid([coords_w, coords_h])
                         ).transpose(1, 2).unsqueeze(1).unsqueeze(0).type(x.dtype).to(x.device)
    normalizer = torch.tensor([W, H], dtype=x.dtype, device=x.device).view(1, 2, 1, 1, 1)
    coords = 2 * (coords + offset) / normalizer - 1
    coords = F.pixel_shuffle(coords.view(B, -1, H, W), self.scale).view(
        B, 2, -1, self.scale * H, self.scale * W).permute(0, 2, 3, 4, 1).contiguous().flatten(0, 1)
    return F.grid_sample(x.reshape(B * self.groups, -1, H, W), coords, mode='bilinear',
                         align_corners=False, padding_mode="border").view(B, -1, self.scale * H, self.scale * W)

def forward_lp(self, x):
    if hasattr(self, 'scope'):
        offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos
    else:
        offset = self.offset(x) * 0.25 + self.init_pos
    return self.sample(x, offset)

def forward_pl(self, x):
    x_ = F.pixel_shuffle(x, self.scale)
    if hasattr(self, 'scope'):
        offset = F.pixel_unshuffle(self.offset(x_) * self.scope(x_).sigmoid(), self.scale) * 0.5 + self.init_pos
    else:
        offset = F.pixel_unshuffle(self.offset(x_), self.scale) * 0.25 + self.init_pos
    return self.sample(x, offset)

def forward(self, x):
    if self.style == 'pl':
        return self.forward_pl(x)
    return self.forward_lp(x)我找到的DLU代码是不是是下面呢，我应该复制哪段去注册和替换呢，yaml文件能直接插入吗：class CARAFENaiveFunction(Function):

@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
    return g.op(
        'MMCVCARAFENaive',
        features,
        masks,
        kernel_size=kernel_size,
        group_size=group_size,
        scale_factor=scale_factor)

@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
    assert scale_factor >= 1
    assert masks.size(1) == kernel_size * kernel_size * group_size
    assert masks.size(-1) == features.size(-1) * scale_factor
    assert masks.size(-2) == features.size(-2) * scale_factor
    assert features.size(1) % group_size == 0
    assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
    ctx.kernel_size = kernel_size
    ctx.group_size = group_size
    ctx.scale_factor = scale_factor
    ctx.feature_size = features.size()
    ctx.mask_size = masks.size()

    n, c, h, w = features.size()
    output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
    ext_module.carafe_naive_forward(
        features,
        masks,
        output,
        kernel_size=kernel_size,
        group_size=group_size,
        scale_factor=scale_factor)

    if features.requires_grad or masks.requires_grad:
        ctx.save_for_backward(features, masks)
    return output

@staticmethod
def backward(ctx, grad_output):
    assert grad_output.is_cuda

    features, masks = ctx.saved_tensors
    kernel_size = ctx.kernel_size
    group_size = ctx.group_size
    scale_factor = ctx.scale_factor

    grad_input = torch.zeros_like(features)
    grad_masks = torch.zeros_like(masks)
    ext_module.carafe_naive_backward(
        grad_output.contiguous(),
        features,
        masks,
        grad_input,
        grad_masks,
        kernel_size=kernel_size,
        group_size=group_size,
        scale_factor=scale_factor)

    return grad_input, grad_masks, None, None, None

carafe_naive = CARAFENaiveFunction.apply

class CARAFENaive(Module):

text
def __init__(self, kernel_size, group_size, scale_factor):
    super(CARAFENaive, self).__init__()

    assert isinstance(kernel_size, int) and isinstance(
        group_size, int) and isinstance(scale_factor, int)
    self.kernel_size = kernel_size
    self.group_size = group_size
    self.scale_factor = scale_factor

def forward(self, features, masks):
    return carafe_naive(features, masks, self.kernel_size, self.group_size,
                        self.scale_factor)

class CARAFEFunction(Function):

text
@staticmethod
def symbolic(g, features, masks, kernel_size, group_size, scale_factor):
    return g.op(
        'MMCVCARAFE',
        features,
        masks,
        kernel_size=kernel_size,
        group_size=group_size,
        scale_factor=scale_factor)

@staticmethod
def forward(ctx, features, masks, kernel_size, group_size, scale_factor):
    assert scale_factor >= 1
    assert masks.size(1) == kernel_size * kernel_size * group_size
    assert masks.size(-1) == features.size(-1) * scale_factor
    assert masks.size(-2) == features.size(-2) * scale_factor
    assert features.size(1) % group_size == 0
    assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
    ctx.kernel_size = kernel_size
    ctx.group_size = group_size
    ctx.scale_factor = scale_factor
    ctx.feature_size = features.size()
    ctx.mask_size = masks.size()

    n, c, h, w = features.size()
    output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
    routput = features.new_zeros(output.size(), requires_grad=False)
    rfeatures = features.new_zeros(features.size(), requires_grad=False)
    rmasks = masks.new_zeros(masks.size(), requires_grad=False)
    ext_module.carafe_forward(
        features,
        masks,
        rfeatures,
        routput,
        rmasks,
        output,
        kernel_size=kernel_size,
        group_size=group_size,
        scale_factor=scale_factor)

    if features.requires_grad or masks.requires_grad:
        ctx.save_for_backward(features, masks, rfeatures)
    return output

@staticmethod
def backward(ctx, grad_output):
    assert grad_output.is_cuda

    features, masks, rfeatures = ctx.saved_tensors
    kernel_size = ctx.kernel_size
    group_size = ctx.group_size
    scale_factor = ctx.scale_factor

    rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
    rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
    rgrad_input = torch.zeros_like(features, requires_grad=False)
    rgrad_masks = torch.zeros_like(masks, requires_grad=False)
    grad_input = torch.zeros_like(features, requires_grad=False)
    grad_masks = torch.zeros_like(masks, requires_grad=False)
    ext_module.carafe_backward(
        grad_output.contiguous(),
        rfeatures,
        masks,
        rgrad_output,
        rgrad_input_hs,
        rgrad_input,
        rgrad_masks,
        grad_input,
        grad_masks,
        kernel_size=kernel_size,
        group_size=group_size,
        scale_factor=scale_factor)
    return grad_input, grad_masks, None, None, None

carafe = CARAFEFunction.apply

class CARAFE(Module):
""" CARAFE: Content-Aware ReAssembly of FEatures

text
Please refer to https://arxiv.org/abs/1905.02188 for more details.

Args:
    kernel_size (int): reassemble kernel size
    group_size (int): reassemble group size
    scale_factor (int): upsample ratio

Returns:
    upsampled feature map
"""

def __init__(self, kernel_size, group_size, scale_factor):
    super(CARAFE, self).__init__()

    assert isinstance(kernel_size, int) and isinstance(
        group_size, int) and isinstance(scale_factor, int)
    self.kernel_size = kernel_size
    self.group_size = group_size
    self.scale_factor = scale_factor

def forward(self, features, masks):
    return carafe(features, masks, self.kernel_size, self.group_size,
                  self.scale_factor)

@MODELS.register_module(name='dlu')
class DLUPack(nn.Module):
"""
Args:
channels (int): input feature channels
scale_factor (int): upsample ratio
up_kernel (int): kernel size of CARAFE op
up_group (int): group size of CARAFE op
encoder_kernel (int): kernel size of content encoder
encoder_dilation (int): dilation of content encoder
compressed_channels (int): output channels of channels compressor

text
Returns:
    upsampled feature map
"""

def __init__(self,
             channels,
             scale_factor,
             up_kernel=5,
             up_group=1,
             encoder_kernel=3,
             encoder_dilation=1,
             compressed_channels=64):
    super(DLUPack, self).__init__()
    self.channels = channels
    self.scale_factor = scale_factor
    self.up_kernel = up_kernel
    self.up_group = up_group
    self.encoder_kernel = encoder_kernel
    self.encoder_dilation = encoder_dilation
    self.compressed_channels = compressed_channels
    self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
                                        1)
    self.kernel_space_generator = nn.Conv2d(
        self.compressed_channels,
        self.up_kernel * self.up_kernel * self.up_group,
        self.encoder_kernel,
        padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
        dilation=self.encoder_dilation,
        groups=1)
    self.conv_offset = nn.Conv2d(
        self.compressed_channels,
        self.up_group * 2 * self.scale_factor * self.scale_factor,
        self.encoder_kernel,
        padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
        dilation=self.encoder_dilation,
        bias=True)
    self.init_weights()

def init_weights(self):
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            xavier_init(m, distribution='uniform')
    normal_init(self.kernel_space_generator, std=0.001)
    self.conv_offset.weight.data.zero_()
    self.conv_offset.bias.data.zero_()


def kernel_space_normalizer(self, mask):
    n, mask_c, h, w = mask.size()
    # use float division explicitly,
    # to void inconsistency while exporting to onnx
    mask_channel = int(mask_c / float(self.up_kernel**2))
    mask = mask.view(n, mask_channel, -1, h, w)

    mask = F.softmax(mask, dim=2)
    mask = mask.view(n, mask_c, h, w).contiguous()        
    return mask

def kernel_space_expander(self, offset, mask):
    n, _, h, w = offset.size()
    offset = F.pixel_shuffle(offset, self.scale_factor)
    offset = offset.permute(0,2,3,1)
    offset[:,:,:,0] = offset[:,:,:,0] * 1/(w-1)*2
    offset[:,:,:,1] = offset[:,:,:,1] * 1/(h-1)*2

    new_h = torch.repeat_interleave(torch.linspace(-1, 1, h),self.scale_factor).view(-1, 1).repeat(1, self.scale_factor*w)
    new_w = torch.repeat_interleave(torch.linspace(-1, 1, w),self.scale_factor).repeat(self.scale_factor*h, 1)

    grid = torch.cat((new_w.unsqueeze(2), new_h.unsqueeze(2)), dim=2)
    grid = grid.unsqueeze(0)
    grid_ = grid.expand(n,-1,-1,-1)  
    grid_ = grid_.to(device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
    offset = grid_ + offset
    mask_ = F.grid_sample(mask, offset,padding_mode='border',align_corners=True)     
    return mask_

def feature_reassemble(self, x, mask):
    x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
    return x

def forward(self, x):
    compressed_x = self.channel_compressor(x)
    offset = self.conv_offset(compressed_x)
    mask = self.kernel_space_generator(compressed_x)
    mask = self.kernel_space_normalizer(mask)

    mask_ = self.kernel_space_expander(offset, mask)

    x = self.feature_reassemble(x, mask_)
    return x

Answer 1

先直接回答你几个关键点，然后再说怎么在 YOLO 里落地：

1. DySample vs DLU：谁更轻，谁效果更好，谁更新？

1.1 参数量 & 复杂度（以 2× 上采样、C=256 为例）

DySample（你贴的这个实现）

style='lp'、groups=4、scale=2 时：
- out_channels = 2 * groups * scale^2 = 2*4*4 = 32
- offset 这一层是 Conv2d(256 → 32, k=1)，参数量 ≈ 256×32 = 8,192
- 如果用了 dyscope=True，再加一个同规格 scope 卷积，再 +8,192
所以：
- 静态版（无 scope）：≈ 8K 参数 / 每个上采样点
- 动态版（有 scope）：≈ 16K 参数 / 每个上采样点

完全是纯 PyTorch：conv + pixel_shuffle + grid_sample，不需要自定义 CUDA。(Learning to Upsample by Learning to Sample.pdf)

DLU（DLUPack）

DLU 的 kernel-generation 有 3 个主要卷积：(DLU.pdf)

channel_compressor: Conv2d(C → Cm, 1x1)
kernel_space_generator: Conv2d(Cm → kup^2 * up_group, k=encoder_kernel)
conv_offset: Conv2d(Cm → up_group * 2 * scale^2, k=encoder_kernel)

论文建议配置：Cm=64, kup=5, encoder_kernel=3, up_group=1, scale=2。
那大概是：

channel_compressor: 256×64 ≈ 16K
kernel_space_generator: 64×25×3×3 ≈ 14.4K
conv_offset: 64×8×3×3 ≈ 4.6K

合计 ≈ 35K 参数 / 每个上采样层，和论文 Table 2 里 “per op 35K params” 一致。(DLU.pdf)

结论：

在你这种典型 “2× FPN 上采样、C≈256” 场景下：
DySample < DLU < CARAFE（参数和 FLOPs 都是这个量级关系）。

如果是 4×、8×、16× 这类大尺度上采样，DLU 相对 CARAFE 的优势会越来越明显，但和 DySample 谁更轻，还是要看具体配置。整体来看，DySample 仍然是最“抠门”的那个。

1.2 性能（从论文层面看）

DLU 论文里（在 FPN 检测上）：(DLU.pdf)

Baseline 最近邻：mAP ≈ 37.4
CARAFE：mAP ≈ 38.6
DLU：mAP ≈ 38.6（和 CARAFE 持平），但参数从 74K → 35K，FLOPs 也更少。

在 Libra R-CNN、Semantic FPN 等任务中，DLU 通常比 CARAFE 略好一点点，同时参数大幅减少，尤其是 8×、16× 这种大上采样时差距很夸张。(DLU.pdf)

DySample 论文里（在 SegFormer / Faster RCNN 等）：(Learning to Upsample by Learning to Sample.pdf)

Semantic segmentation（SegFormer-B1, ADE20K）：
- 双线性：41.68 mIoU
- CARAFE：42.82 mIoU
- DySample-S+：43.58 mIoU，且只多了 1~10K 级别 的参数。
Faster R-CNN（COCO）：
- 最近邻：37.5 AP
- CARAFE：38.6 AP
- DySample+：38.7 AP，同样额外参数很小。

整体感觉：

两者都比双线性稳定提升 1pt 以上。
DySample 在多个任务上整体略优于 CARAFE，DLU 在自己那篇论文里也基本和/略优 CARAFE。
直接横向比较（不同论文、不同 baseline）是不严格的，但至少能说明：
- 两者都是“在不太增加复杂度的前提下，能带来稳定收益”的动态上采样。
- DySample 在轻量和好用程度上更“插件化”、更适合你这样自己往 YOLO 里塞。

1.3 哪个更“新”？

DySample：2023 年 ICCV 论文。(Learning to Upsample by Learning to Sample.pdf)
DLU：2024 年的工作（Lighten CARAFE: DLU）。(DLU.pdf)

时间上 DLU 更新，但“更新 ≠ 一定更好”，尤其你目前只需要 2× 上采样、又是 YOLO 这种实时模型，DySample 其实更合适、更容易落地。

2. 你说的 DySample 静态/动态模型现象怎么理解？

你实现的这个 DySample 其实正是论文里说的 “static scope factor / dynamic scope factor” 两种版本：(Learning to Upsample by Learning to Sample.pdf)

静态版：offset = self.offset(x) * 0.25 + self.init_pos
动态版：offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos

静态版直接把 offset 范围缩放在一个固定区间（理论上能减轻采样重叠、减少边界伪影），动态版会再乘一个由特征算出来的 gate（scope），更灵活，但也更复杂一点。

你观测到：

静态版：mAP50-95 略降，但是其他指标都升

可能原因大致是：

整体召回/粗粒度定位变好了（比如 mAP50、mAP small、mAP large 之类），所以其他指标上升；
但由于采样点的偏移 + FPN 多次上采样叠加，对 高 IoU 阈值（0.75~0.95）的位置精度 反而有一点点破坏，所以 AP50-95 的平均值略降；
动态版 scope 更灵活，理论上更有能力在“边界清晰”和“内部一致性”之间取得平衡，论文里的实验也是动态版略优一点。(Learning to Upsample by Learning to Sample.pdf)

实践建议：

如果你本地实验中：动态版的 mAP50-95 ≥ 基线或 ≥ 静态版，那就优先用 动态版（dyscope=True）。
再微调：groups（4 / 8）、style='lp'/'pl'，这些在论文里也有做 ablation（一般分组数 4 或 8 是一个不错的折中）。(Learning to Upsample by Learning to Sample.pdf)

3. 你贴的 DLU 代码是不是对的？yaml 能不能直接插入？

3.1 这是不是 DLU 的代码？

是的，这段确实就是 DLU 官方框架里的核心代码结构：(DLU.pdf)

CARAFENaiveFunction / CARAFEFunction / CARAFE / carafe
- 负责封装 CARAFE 的前向 & 反向，里面用到了 ext_module.carafe_forward/backward —— 这是 mmcv/mmdet 里编译好的 CUDA 扩展。
DLUPack(nn.Module)
- 这里才是真正的 DLU 上采样模块：
  - channel_compressor
  - kernel_space_generator
  - conv_offset
  - kernel_space_normalizer
  - kernel_space_expander （里面用 grid_sample 做 GUIDED 采样）
  - feature_reassemble 里调用 carafe(x, mask_, ...) 完成特征重组

要在 YOLO 里用 DLU，最小闭包至少需要：

CARAFEFunction / carafe （以及依赖的 CUDA 扩展）

DLUPack

光复制 DLUPack 一段是跑不动的。

3.2 yaml 能不能直接插入这段代码？

不能。

Ultralytics 的 yaml 只能写：

yaml
- [from, number, ModuleName, [args...]]

这里的 ModuleName 必须是 已经在 Python 代码里定义并注册好的类/函数名，yaml 里是不能直接粘贴 class DLUPack(nn.Module): ... 这种代码的。

所以，正确流程是：

在 Ultralytics 项目里新建一个 .py 文件，比如 ultralytics/nn/modules/dlu.py，把 CARAFEFunction、carafe、DLUPack 这些 class/函数放进去。
确保 ext_module 对应的 C++/CUDA 扩展能正常 import（例如 mmcv 已经安装并编译带 carafe 的版本）；
在 ultralytics/nn/modules/__init__.py 里 from .dlu import DLUPack，并加入到 __all__；
最后在模型 yaml 里用类似：
```
yaml
- [-1, 1, DLUPack, [channels, 2, 5, 1, 3, 1, 64]]
```
这种形式调用（具体参数按 DLUPack 的 init 来填）。

实际上，由于 需要编译 mmcv 的 carafe CUDA 扩展，这一整套流程对 Ultralytics 这种纯 PyTorch 工程来说，集成成本很高，不像 DySample 那么“拎包即用”。

4. 建议：在 YOLOv11 里怎么优雅地替换上采样（以 DySample 为例）

下面给一套可操作的思路，你可以对照 Ultralytics 的代码结构稍微改一下（YOLOv8/9/11 的结构差不多）：

4.1 在代码里定义模块

新建文件，例如：ultralytics/nn/modules/dysample.py，把你贴的 DySample 类放进去，然后再加一个包装层，专门做 FPN 的上采样：

python
# ulralytics/nn/modules/dysample.py
import torch
import torch.nn as nn
import torch.nn.functional as F

# 你原来的 DySample 类，略...

class DySampleUp(nn.Module):
    """
    YOLO FPN 专用的 DySample 上采样模块
    c1: 输入通道
    scale: 上采样倍率 (一般是 2)
    这里假设上采样前后通道数不变
    """
    def __init__(self, c1, scale=2, style='lp', groups=4, dyscope=True):
        super().__init__()
        self.dysample = DySample(
            in_channels=c1,
            scale=scale,
            style=style,
            groups=groups,
            dyscope=dyscope
        )

    def forward(self, x):
        return self.dysample(x)

注意：

确保 groups 能整除 in_channels，否则 assert in_channels % groups == 0 会炸。

如果你有某些层通道比较小（比如 64），就把 groups 改小一点（1 或 2）。

然后在 ultralytics/nn/modules/__init__.py 里加入：

python
from .dysample import DySampleUp

__all__ = [
    # ...
    'DySampleUp',
]

4.2 在 yaml 里替换原来的双线性上采样

找一找你 yolov11-*.yaml 里类似这样的层（FPN/Path Aggregation 那几层）：

yaml
# 伪代码形式，大概类似
- [-1, 1, nn.Upsample, [None, 2, 'bilinear']]

或者在 Python 版模型定义里看到：

python
self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)

你就把这些地方替换成我们刚才注册的模块，比如 yaml 里改成：

yaml
# 假设某个尺度的特征通道数是 256
- [-1, 1, DySampleUp, [256, 2, 'lp', 4, True]]

然后把 DySampleUp.__init__ 的签名改成配合 yaml：

python
class DySampleUp(nn.Module):
    def __init__(self, c1, scale=2, style='lp', groups=4, dyscope=True):
        ...

Ultralytics 构建模型时会自动把 c1 设置成上一层输出通道数。

4.3 训练上的注意点

更换了模块后，对应权重 shape 变了，不能直接加载旧的上采样层权重，这几层需要重新随机初始化；
如果你是从官方的 YOLOv11 预训练权重开始 finetune，建议：
- 只加载 backbone & 一部分 neck 的权重；
- 新的 DySample 层保持默认 init（论文里 offset conv 用较小 std 的 normal_init 是对的）；
- 适当降低初始学习率，或者用更长一点的 warmup，避免一开始 offset 抖太厉害。
训练完后重点看：
- mAP50 vs mAP50-95
- 中小目标（mAPs/mAPm）是否有提升

5. 如果你“非要”上 DLU，要注意什么？

纯技术上是可行的，但门槛比 DySample 高很多：

你需要一个能用的 carafe CUDA op：
- 通常是安装 mmcv-full（要和 PyTorch/CUDA 版本对得上）；
DLU 的那大段代码基本都要搬进来：
- CARAFEFunction / carafe
- DLUPack
在 Ultralytics 里注册 DLUPack 类，然后在 yaml 里用类似：
```
yaml
- [-1, 1, DLUPack, [channels, 2, 5, 1, 3, 1, 64]]
```
对应 __init__(channels, scale_factor, up_kernel, up_group, encoder_kernel, encoder_dilation, compressed_channels)。

如果你对 C++/CUDA 扩展编译不是很熟、又主要是做 YOLO 检测实验，我真心建议：

优先用 DySample（动态 scope 版），DLU 可以等之后有时间再慢慢折腾。

一句话总结给你：

参数量：DySample（特别是 S/S+ 版本）最少，DLU 比 CARAFE 轻但还是比 DySample 重一点。
表现：两者都显著优于双线性；从各自论文看，DySample 在多任务上整体更强一点。
新旧：DLU 比 DySample 新，但对 YOLO 这种 2× 上采样、实时模型来说，DySample 更适合作为“bilinear 的直接替代品”。
落地成本：DySample 纯 PyTorch，直接塞进 YOLO 代码就能用；DLU 需要 CARAFE 的 CUDA 扩展，工程成本明显更高。

如果你愿意，可以把你当前的 yolov11.yaml 和你改后的 DySample 代码也贴一份，我可以帮你具体看看哪几个 upsample 层最适合替换、参数怎么配比较稳。

Answer 2

先说结论，然后一点点对照你贴的代码排错：

一、两个报错分别是什么原因？

1）`TypeError: '<' not supported between instances of 'int' and 'NoneType'`

报错位置：

python
c2 = make_divisible(min(c2, max_channels) * width, 8)

这行只会在 if m in base_modules: 里面执行。
你又在 base_modules 里加了 DySample：

python
base_modules = frozenset(
    {
        ...
        LSKBlock,
        DySample,   # ← 你加的
        MSCA,
    }
)

但 DySample 的第一个参数不是 out_channels，而是 scale，而且它本身也不该走“卷积类模块”的这套逻辑。

更关键的是：有些模块（比如原来的 nn.Upsample）在 yaml 里写的是 [None, 2, 'nearest']，
args[0] 就是 None，一旦这个模块被错误地当成 base_modules 去处理，就会变成：

python
c1, c2 = ch[f], args[0]  # c2 = None
c2 = make_divisible(min(c2, max_channels) * width, 8)
# -> min(None, 1024) : int 和 None 比较，直接 TypeError

所以第一次那个 TypeError，本质就是：

把 DySample（以及类似的模块）错误地丢进了 base_modules，
让解析器以为 args[0] 是“输出通道”，去做 min(c2, max_channels)，结果 c2 里有 None。

2）`IndexError: list index out of range`（出现在 Detect 上）

报错位置：

python
elif m in frozenset({Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}):
    args.append([ch[x] for x in f])

你 yaml 里 Detect 这一行是：

yaml
- [[17, 20, 23], 1, Detect, [nc]] # Detect(P3, P4, P5)

parse_model 在 构建第 i 层时，ch 里只存放了 0..(i-1) 层的输出通道数。
对 Detect 这一行来说，它自己的索引是 23，所以此时 ch 只有 index 0..22。

但你写的 from = [17, 20, 23]：

17 → OK
20 → OK
23 → 还没构建到 Detect 本身，ch[23] 越界 → IndexError

同时，从结构上看，你真正要拿来做检测的 P3/P4/P5 的输出层，还是：

P3：第 16 层
P4：第 19 层
P5：第 22 层

也就是说，这一行 根本不应该改成 [17, 20, 23]，应该保持原始的：

yaml
- [[16, 19, 22], 1, Detect, [nc]] # Detect(P3, P4, P5)

你把 Detect 这行“跟着一起 +1”了，就直接踩到 ch 越界。

二、DySample 在 parse_model 里的正确写法

你现在的这段是：

python
elif m is DySample:
    c2=ch[f]
    args=[ch[f], *args]

思路是对的：DySample 不改通道数，输出通道 c2 = 输入通道 c1，并且在 args 前面插入 in_channels。

但是有两个关键点要注意：

DySample 不应该出现在 base_modules 里，否则永远先走 if m in base_modules: 分支，
elif m is DySample: 这一段根本不会执行到（被短路了）。
elif m is DySample: 必须放在其他 elif 分支之前不会被覆盖就行，现在位置 OK。

所以你可以修改为：

1）从 `base_modules` 里删掉 `DySample`

python
base_modules = frozenset(
    {
        Classify,
        Conv,
        CBAM,
        ConvTranspose,
        GhostConv,
        Bottleneck,
        GhostBottleneck,
        SPP,
        SPPF,
        C2fPSA,
        C2PSA,
        DWConv,
        Focus,
        BottleneckCSP,
        C1,
        C2,
        C2f,
        C3k2,
        RepNCSPELAN4,
        ELAN1,
        ADown,
        AConv,
        SPPELAN,
        C2fAttn,
        C3,
        C3TR,
        C3Ghost,
        torch.nn.ConvTranspose2d,
        DWConvTranspose2d,
        C3x,
        RepC3,
        PSA,
        SCDown,
        C2fCIB,
        A2C2f,
        LSKBlock,
        # ❌ DySample 不要放这里
        MSCA,
    }
)

2）单独给 DySample 写一个分支

紧接着其它 elif 其中之一，加上：

python
        elif m is DySample:
            # DySample 不改变通道数：c2 = c1
            c1 = ch[f]
            c2 = c1

            # yaml 里你写的是 [2, "lp"]，即 [scale, style]
            scale = args[0] if len(args) > 0 else 2
            style = args[1] if len(args) > 1 else "lp"
            groups = args[2] if len(args) > 2 else 4
            dyscope = args[3] if len(args) > 3 else False

            # DySample(in_channels, scale, style, groups, dyscope)
            args = [c1, scale, style, groups, dyscope]

注意这里 完全不走 make_divisible 那套逻辑，也不依赖 max_channels。

这样：

TypeError 那个 min(c2, max_channels) 就不会再碰到 DySample；
DySample 会正确地变成 DySample( in_channels = ch[f], scale=2, style="lp", groups=4, dyscope=False )；

你打印的日志就应该变成：

text
DySample: in_channels=1024, groups=4, style=lp, scale=2
DySample: in_channels=512,  groups=4, style=lp, scale=2
...

如果你看到 scale=8 那种，基本就是参数顺序 / 解析错位了，按上面这样拼 args 就对。

三、head 部分 yaml 的正确写法

你现在的 head：

yaml
head:
  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 6], 1, Concat, [1]]
  - [-1, 2, C3k2, [512, False]] # 13

  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 4], 1, Concat, [1]]
  - [-1, 2, C3k2, [256, False]] # 16 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]]
  - [[-1, 14], 1, Concat, [1]] # cat head P4   ← 你改成 14
  - [-1, 2, C3k2, [512, False]] # 19 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]]
  - [[-1, 11], 1, Concat, [1]] # cat head P5   ← 你改成 11
  - [-1, 2, C3k2, [1024, True]] # 22 (P5/32-large)

  - [[17, 20, 23], 1, Detect, [nc]] #  ← 你也改了 from

其实除了把 nn.Upsample 换成 DySample 以外，其它 from 索引完全不需要动，保持原版就行。
推荐改回下面这样：

yaml
head:
  # 11: P5 -> P4 上采样
  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 6], 1, Concat, [1]]          # 12, cat backbone P4
  - [-1, 2, C3k2, [512, False]]        # 13, FPN P4

  # 14: P4 -> P3 上采样
  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 4], 1, Concat, [1]]          # 15, cat backbone P3
  - [-1, 2, C3k2, [256, False]]        # 16, P3/8-small

  - [-1, 1, Conv, [256, 3, 2]]         # 17, P3 -> P4 下采样
  - [[-1, 13], 1, Concat, [1]]         # 18, cat head P4 (注意这里是 13，不是 14)
  - [-1, 2, C3k2, [512, False]]        # 19, P4/16-medium

  - [-1, 1, Conv, [512, 3, 2]]         # 20, P4 -> P5 下采样
  - [[-1, 10], 1, Concat, [1]]         # 21, cat head P5 (注意这里是 10，不是 11)
  - [-1, 2, C3k2, [1024, True]]        # 22, P5/32-large

  - [[16, 19, 22], 1, Detect, [nc]]    # 23, Detect(P3, P4, P5)

这样：

P3 由第 16 层输出 → Detect from: 16
P4 由第 19 层输出 → Detect from: 19
P5 由第 22 层输出 → Detect from: 22
Detect 构建时 i=23，ch 只到 22，不再访问 ch[23]，IndexError 消失。

四、顺带回答你说的 “他也没有 sl”

你在 yaml 里定义的：

yaml
scales:
  n: [0.50, 0.25, 1024]
  s: [0.50, 0.50, 1024]
  m: [0.50, 1.00, 512]
  l: [1.00, 1.00, 512]
  x: [1.00, 1.50, 512]

这是 YOLO11 官方的写法，规模只有 n/s/m/l/x 五种，本来就没有 sl 这个 key。
一般 YOLO('yolo11n.yaml') 会自动把 scale 设置为 'n'，然后取 scales['n']，不会去找 sl，所以这个不用担心。

五、你接下来可以直接这样检查一下

确认 DySample 没有出现在 base_modules 里，但有单独的 elif m is DySample 分支。
把 head 按我给你的版本改回（只替换 Upsample → DySample，其它 from 索引不动）。
在 DySample 的 __init__ 里保留你的 debug 打印：
```
python
print(f"DySample: in_channels={in_channels}, groups={groups}, style={style}, scale={scale}")
```
确认两层分别是 (1024, 2) 和 (512, 2) 这种组合。

如果你跑完还有新的报错，把新的栈信息和你最新的 DySample 定义贴出来，我可以帮你继续对着改。

Answer 3

先把几个关键词对上：

lp / pl：控制的是offset 生成方式（Linear+PixelShuffle vs PixelShuffle+Linear）
static / dynamic：说的是scope factor（偏移幅度的静态/动态门控），在你代码里用 dyscope 这个布尔量控制
不管 lp / pl、static / dynamic，DySample 本身都是“动态上采样”（offset 由特征算出来）

所以：“哪个是静态、哪个是动态上采样”这个说法有点混了两个维度，我们拆开讲。

1. lp 和 pl 的本质区别

对应论文里的 LP-style 和 PL-style 两种 offset 生成风格。(Learning to Upsample by Learning to Sample.pdf)

1.1 `style='lp'`：Linear + PixelShuffle（LP 风格）

你的代码里：

python
def forward_lp(self, x):
    if hasattr(self, 'scope'):
        offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos
    else:
        offset = self.offset(x) * 0.25 + self.init_pos
    return self.sample(x, offset)

流程是：

直接对输入特征 x 做 1×1 卷积：offset = Conv2d(in_channels, out_channels)
out_channels = 2 * groups * scale^2（比如 C=256,g=4,s=2 时，out=32）
在 sample() 里面用 pixel_shuffle + grid_sample 把这个 offset reshape 成 (2, sH, sW) 的采样网格。

特点：

参数量（offset 分支）∝ in_channels * 2 * groups * scale^2
不需要对输入做预先的 pixel_shuffle，计算图比较简单
论文里叫 LP-style（linear+pixel shuffle），偏“完整版”，参数稍多一点但推理快、显存占用小(Learning to Upsample by Learning to Sample.pdf)

1.2 `style='pl'`：PixelShuffle + Linear（PL 风格）

你的代码里：

python
def __init__(...):
    if style == 'pl':
        in_channels = in_channels // scale ** 2
        out_channels = 2 * groups
    else:
        out_channels = 2 * groups * scale ** 2
    self.offset = nn.Conv2d(in_channels, out_channels, 1)

python
def forward_pl(self, x):
    x_ = F.pixel_shuffle(x, self.scale)  # C -> C/s^2, H,W -> sH,sW
    if hasattr(self, 'scope'):
        offset = F.pixel_unshuffle(
            self.offset(x_) * self.scope(x_).sigmoid(), self.scale
        ) * 0.5 + self.init_pos
    else:
        offset = F.pixel_unshuffle(self.offset(x_), self.scale) * 0.25 + self.init_pos
    return self.sample(x, offset)

流程是：

先对 x 做 pixel_shuffle，空间从 H×W 变成 sH×sW，通道从 C 变成 C / s^2
在高分辨率上对压缩后的特征做 1×1 卷积：Conv2d(C/s^2 → 2*groups)
再 pixel_unshuffle 回到原来的低分辨率 offset 格式，交给 sample() 去用。

特点：

offset 卷积参数量 ∝ (in_channels / scale^2) * 2 * groups
对比 LP，在论文原始设计中，参数数目可以减到 1/s⁴（s=2 时直接少 16 倍）(Learning to Upsample by Learning to Sample.pdf)
但需要额外的 pixel_shuffle/pixel_unshuffle，显存和一点点延时会上去一点
论文里叫 PL-style（pixel shuffle + linear），他们发现：
- 在 SegFormer 语义分割上，PL（特别是 S / S+ 版本）表现更好；
- 在检测/实例分割等任务上，则 LP 稍微更稳定一点(Learning to Upsample by Learning to Sample.pdf)

2. 哪个是“静态”，哪个是“动态”？

都不是。
lp / pl 只决定 “offset 怎么算”，不是静态/动态。

在 DySample 里，“静态/动态”说的是 scope factor（偏移幅度的缩放因子）：

静态 scope factor（static）：
- 公式：O = 0.25 * linear(X) (Learning to Upsample by Learning to Sample.pdf)
- 你的代码里就是 没有 scope 卷积的情况：
```
python
# static
offset = self.offset(x) * 0.25 + self.init_pos
```
动态 scope factor（dynamic）：
- 公式：O = 0.5 * sigmoid(linear1(X)) * linear2(X)(Learning to Upsample by Learning to Sample.pdf)
- 你的代码里是 有 scope 卷积的情况：
```
python
# dynamic
offset = self.offset(x) * self.scope(x).sigmoid() * 0.5 + self.init_pos
```
- scope(x).sigmoid() ∈ (0,1)，乘上 0.5，等价于把 offset 范围从固定 0.25 换成一个 按位置变化的 [0, 0.5] 区间。

而你在 __init__ 里是这样写的：

python
class DySample(nn.Module):
    def __init__(..., dyscope=False):
        ...
        self.offset = nn.Conv2d(...)
        ...
        if dyscope:
            self.scope = nn.Conv2d(...)

dyscope=False → 只用 offset 那个卷积 → 静态 scope factor（论文里的 “static scope factor” 版本）(Learning to Upsample by Learning to Sample.pdf)
dyscope=True → 同时用 offset 和 scope 两个卷积 → 动态 scope factor（论文里的 “dynamic scope factor” 版本，即 DySample+ / S+）(Learning to Upsample by Learning to Sample.pdf)

总结一下：

维度	选项	你代码里的参数
offset 风格	LP / PL	`style='lp'/'pl'`
scope 类型	static / dynamic	`dyscope=False/True`

所以：

style='lp', dyscope=False → DySample（LP + static）
style='lp', dyscope=True → DySample+（LP + dynamic）
style='pl', dyscope=False → DySample-S（PL + static）
style='pl', dyscope=True → DySample-S+（PL + dynamic）

四个都是“动态上采样”（都会根据特征生成 offset），只是设计细节不同。

3. 这两个参数在 YOLOv11 里怎么调？

结合论文的 ablation 和你的 YOLO 场景，可以这么玩：(Learning to Upsample by Learning to Sample.pdf)

3.1 首选推荐（检测任务，YOLO）

建议先用：style='lp', groups=4, dyscope=True

理由：

目标检测跟 Faster R-CNN / Mask R-CNN 比较类似，论文里在这些任务上 LP 风格（DySample / DySample+）通常略优于 PL 风格 或差不多，但更省显存。(Learning to Upsample by Learning to Sample.pdf)
groups=4 是作者在 LP 风格下推荐的分组数，性能和稳定性都不错。(Learning to Upsample by Learning to Sample.pdf)
dyscope=True（动态 scope）一般能带来 +0.1 左右 AP/mIoU 的小提升，代价是增加一个额外的 1×1 卷积，但参数量很小。(Learning to Upsample by Learning to Sample.pdf)

如果你用的是 n/s 超轻量模型，担心算力/显存：

可以先用 dyscope=False，也就是 静态 scope 版本，只用一个 offset 卷积，复杂度 minimal。
训练完看看：
- 如果 mAP50 上去了但 mAP50-95 稍掉一点，很可能是你之前遇到的情况：
  整体召回好一点，高 IoU 定位精度略被扰动，可以再尝试打开 dyscope=True 看看。

3.2 什么时候考虑 `style='pl'`（PL 风格）

在下面两种场景可以尝试 PL：

你非常在意参数量：
- 对于 scale=2 的上采样，PL 风格的 offset 分支参数是 LP 的大约 1/16，在模型特别大的时候（比如很多 FPN 层都换成动态上采样）会有点收益。(Learning to Upsample by Learning to Sample.pdf)
你之后想做 segmentation / panoptic 这类任务：
- 在 SegFormer、MaskFormer 这类语义/全景分割实验里，论文发现 DySample-S / S+（PL 风格）略优于 LP。(Learning to Upsample by Learning to Sample.pdf)

但注意 PL 有两个小坑：

需要 in_channels % scale^2 == 0（你代码已经 assert 了），否则会直接报错；
多了一次 pixel_shuffle 和 pixel_unshuffle，前向和显存稍微重一点点（比 LP）(Learning to Upsample by Learning to Sample.pdf)

3.3 一个比较实用的调参路线

假设你现在在 YOLO11 上做实验，可以这样试：

Baseline：原始 bilinear / nearest 上采样，记录 mAP50 和 mAP50-95。
DySample（轻量版）
- style='lp', groups=4, dyscope=False
- 看看 mAP50 / mAP50-95 / small-object AP 有没有整体提升。
DySample+（加强版）
- 在 2 的基础上只改 dyscope=True
- 对比 static vs dynamic scope 的变化，通常 dynamic 会让高 IoU 部分恢复一点。
对比 PL 风格
- 同上配置，把 style='pl'，同时把 groups 改为 8（跟论文推荐一致）；(Learning to Upsample by Learning to Sample.pdf)
- 如果性能相近但参数明显少，你就可以考虑常用 PL。

4. 一句总结

lp / pl 是两种 offset 生成结构（LP vs PL），都属于 DySample 的“动态上采样”范畴；
静态/动态（scope） 由 dyscope 决定：False 是静态 scope，True 是动态 scope；
对 YOLOv11 来说，实战上优先推荐：
style='lp', groups=4，先试 dyscope=False，再视情况打开 dyscope=True 做对比。

如果你愿意，可以把你现在的 DySample 定义（完整类）贴一次，我可以帮你标注出哪种组合对应论文里的 DySample / DySample+ / S / S+，顺便一起看看 groups 和 scale 的配置有没有潜在坑。

Answer 4

你这个结果其实**很正常，而且挺“对味”**的 😂：

原始：P=0.923, R=0.828, mAP50=0.890, mAP50-95=0.719
DySample（style='lp', groups=4, dyscope=False）：
P=0.947, R=0.882, mAP50=0.901, mAP50-95=0.720

说明两件事：

静态 scope（DySample 原始版）已经帮 YOLOv11 把召回、粗定位搞得很好；
动态 scope（dyscope=True）和 PL 风格在你这个检测场景里反而有点“过拟合抖动”，高 IoU 反而没有占便宜——这和论文在不同任务上的结论是对得上的：在检测上 LP+static 非常强，PL 和 dynamic 未必一定更好。(Learning to Upsample by Learning to Sample.pdf)

下面我按你现在这个 “LP + static + groups=4” 作为基线，给你三个循序渐进的改进方向，每个方向都尽量控制复杂度，都是结合 DySample / DLU / CARAFE / FADE / SAPA 的思路来的。

先提前说一句：我没法“保证一定提升”，但这些改法都是针对你现在的问题（高 IoU 提升不明显）设计的，理论上是朝着 mAP50-95 方向去的，而且额外参数都很少，可以很快做对比实验。

方向 1：让静态 scope 变成“可学习的静态 scope”

问题定位

DySample 论文里静态 scope 因子固定是 0.25：

O = 0.25\cdot \text{linear}(X)

这是在 SegFormer/Faster-RCNN 上通过表 2 的 ablation 找到的最优值。(Learning to Upsample by Learning to Sample.pdf)

但：

这个 0.25 是在 别的任务+模型 上调出来的；
你的 YOLOv11 + 数据集，最佳可能是 0.2 或 0.3，甚至不同层最优不一样；
你现在直接用 0.25 就已经有提升，那就很有可能还能再榨一点点 juice 出来。

具体做法

在当前 DySample 类里面加一个（每层一个的）可学习 scope 因子 α，但仍然限制在 [0, 0.5] 这个安全区间内：

python
class DySample(nn.Module):
    def __init__(self, in_channels, scale=2, style='lp', groups=4,
                 dyscope=False, learnable_scope=False):
        super().__init__()
        ...
        self.learnable_scope = learnable_scope
        if learnable_scope:
            # 初始化成 sigmoid(logit_alpha) ≈ 0.25
            init_alpha = 0.25
            import math
            logit = math.log(init_alpha / (0.5 - init_alpha))  # 映射到 (0,0.5)
            self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))
        else:
            self.register_buffer('alpha', torch.tensor(0.25, dtype=torch.float))
        ...
    
    def get_alpha(self, x):
        if self.learnable_scope:
            # 映射到 (0, 0.5)
            return 0.5 * torch.sigmoid(self.logit_alpha)
        else:
            return self.alpha

    def forward_lp(self, x):
        alpha = self.get_alpha(x)
        if hasattr(self, 'scope'):
            # 如果以后你想再试动态 scope，这里用 2*alpha 代替原来 0.5
            offset = self.offset(x) * self.scope(x).sigmoid() * (2 * alpha) + self.init_pos
        else:
            # 这里用 alpha 代替 0.25
            offset = self.offset(x) * alpha + self.init_pos
        return self.sample(x, offset)

然后 YOLO 里你现在的两层 DySample 就写成：

yaml
- [-1, 1, DySample, [2, "lp", 4, False, True]]  # scale, style, groups, dyscope, learnable_scope

预期效果：

不会改变“静态 scope”这一整体风格，只是让 0.25 变成数据驱动的最优值；

理论上更有利于在高 IoU（mAP75~95）附近找到最佳平衡点，mAP50-95 有望比现在的 0.720 再抬一点点。

方向 2：借鉴 DLU 的“内容编码器”，给 offset 多一点上下文

看过你贴的 DLU 代码你应该注意到了，它在生成 kernel / mask 之前会做一套：

channel_compressor(1×1) +
kernel_space_generator(3×3, dilation)

本质上是一个局部内容编码器：在生成上采样核/权重之前，先用 3×3 卷积把局部空间信息揉一遍，这样比单纯的 1×1 更容易理解边界 / 纹理。

DySample 目前的 offset 生成只有一个 1×1 conv：

O = \text{Conv}_{1\times1}(X)

你可以照着 DLU 的思路，给 DySample 的 offset 分支也加一个轻量 content encoder，但仍然保持单输入、纯 PyTorch、用 grid_sample 的流程。(Learning to Upsample by Learning to Sample.pdf)

改进版 DySample（只动 offset 分支）

python
class DySampleV2(nn.Module):
    def __init__(self, in_channels, scale=2, style='lp', groups=4,
                 dyscope=False, encoder_kernel=3, encoder_dilation=1,
                 compressed_channels=None, learnable_scope=False):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = groups

        if style == 'pl':
            assert in_channels >= scale ** 2 and in_channels % scale ** 2 == 0
        assert in_channels >= groups and in_channels % groups == 0

        # --------- DLU 风格的内容编码器 ----------
        mid = compressed_channels or max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid, 1)
        self.encoder = nn.Conv2d(
            mid,
            in_channels,   # 保持通道数不变，后面仍然喂给 offset/scope 分支
            encoder_kernel,
            padding=int((encoder_kernel - 1) * encoder_dilation / 2),
            dilation=encoder_dilation,
            groups=1,
            bias=True,
        )

        if style == 'pl':
            in_for_offset = in_channels // scale ** 2
            out_channels = 2 * groups
        else:
            in_for_offset = in_channels
            out_channels = 2 * groups * scale ** 2

        self.offset = nn.Conv2d(in_for_offset, out_channels, 1)
        normal_init(self.offset, std=0.001)

        if dyscope:
            self.scope = nn.Conv2d(in_for_offset, out_channels, 1, bias=False)
            constant_init(self.scope, val=0.)

        self.learnable_scope = learnable_scope
        if learnable_scope:
            import math
            logit = math.log(0.25 / (0.5 - 0.25))
            self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))
        else:
            self.register_buffer('alpha', torch.tensor(0.25, dtype=torch.float))

        self.register_buffer('init_pos', self._init_pos())

    def get_alpha(self):
        if self.learnable_scope:
            return 0.5 * torch.sigmoid(self.logit_alpha)
        else:
            return self.alpha

    def encode(self, x):
        x = self.compress(x)
        x = F.relu(x, inplace=True)
        x = self.encoder(x)
        return x

    def forward_lp(self, x):
        x_enc = self.encode(x)
        alpha = self.get_alpha()
        if hasattr(self, 'scope'):
            offset = self.offset(x_enc) * self.scope(x_enc).sigmoid() * (2 * alpha) + self.init_pos
        else:
            offset = self.offset(x_enc) * alpha + self.init_pos
        return self.sample(x, offset)

    def forward_pl(self, x):
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self.get_alpha()
        if hasattr(self, 'scope'):
            offset = F.pixel_unshuffle(self.offset(x_) * self.scope(x_).sigmoid(), self.scale) * (2 * alpha) + self.init_pos
        else:
            offset = F.pixel_unshuffle(self.offset(x_), self.scale) * alpha + self.init_pos
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == 'pl':
            return self.forward_pl(x)
        return self.forward_lp(x)

UL 实际用的时候，你可以先这样配：

只替换最顶上那一层 P5→P4 的 DySample：
encoder_kernel=3, encoder_dilation=2, compressed_channels=64
P4→P3 那一层先保持原版 DySample，避免一次改太多。

直觉：

检测里的高 IoU 问题，多半是 FPN 顶层语义信息+边界信息没融合好；

用 3×3/dilated conv 做“内容编码器”，可以把更多局部上下文融进 offset，理论上更有利于边界附近的采样位置选择，从而提升 mAP75~95。

方向 3：加一个轻量的“残差门控”，在双线性和 DySample 之间做自适应融合

这是借鉴 FADE/SAPA 和 DLU 里“mask / gate” 的思路：

它们会生成一个局部的权重，在不同位置决定「更信任哪个分支」。(Learning to Upsample by Learning to Sample.pdf)
我们不搞复杂的双输入动态核，只搞一个 单通道 gate，在 “bilinear 输出” 和 “DySample 输出” 之间插。

模块结构

在 DySample 里加一个可选的 gate 分支：

python
class DySampleV2(nn.Module):
    def __init__(..., use_gate=False, gate_ratio=4, ...):
        ...
        self.use_gate = use_gate
        if use_gate:
            hidden = max(in_channels // gate_ratio, 8)
            self.gate = nn.Sequential(
                nn.Conv2d(in_channels, hidden, 1),
                nn.ReLU(inplace=True),
                nn.Conv2d(hidden, 1, 1)
            )
        ...

    def forward(self, x):
        # 先算动态上采样
        if self.style == 'pl':
            y_dyn = self.forward_pl(x)
        else:
            y_dyn = self.forward_lp(x)

        if self.use_gate:
            # 再算一个标准双线性上采样作为“稳定基线”
            y_bi = F.interpolate(x, scale_factor=self.scale, mode='bilinear', align_corners=False)
            g = torch.sigmoid(self.gate(x))  # shape: [B,1,H,W]，再自动 broadcast

            # 残差方式融合：gate=0 → 纯 bilinear，gate=1 → 纯 DySample
            y = y_bi + g * (y_dyn - y_bi)
            return y

        return y_dyn

直观理解：

内部大面积平坦区域，网络可以学到 gate ≈ 0，多用稳定的 bilinear；

边界、纹理复杂区域，gate 会变大，更多地依赖 DySample 的动态采样；

这样对 高 IoU（精细定位） 更友好，因为不会在没必要的地方乱采样。

在 YOLO11 head 里，你可以只在 P4→P3 这一层打开 use_gate=True，顶层先不用 gate，这样：

顶层 DySample 专心做“语义重采样”；
底层那层 gate 帮你把边界细节稳一稳。

实际落地顺序建议（给你一个实验 plan）

如果你时间有限，我建议按这个顺序来做：

DySample + learnable_scope（方向 1）
- 配置：style='lp', groups=4, dyscope=False, learnable_scope=True
- 看一眼 mAP50-95 有没有稳定高于 0.720（哪怕 +0.005 都是好事）。
在 P5→P4 那一层换成 DySampleV2（方向 2），带内容编码器
- encoder_kernel=3, encoder_dilation=2, compressed_channels=64, learnable_scope=True
- P4→P3 先继续用你的老 DySample。
在 P4→P3 打开 gate（方向 3）
- use_gate=True, gate_ratio=4
- 这一步专门看高 IoU 指标（mAP75、mAP50-95）有没有明显改善。

如果你愿意，之后还可以再玩：

不同层 groups ≠ 4，比如 P5 用 4 组，P4 用 2 组；
部分层尝试 style='pl'，只在高分辨率层（接近输入大小的那一层）用 PL-S 风格，其他层保持 LP；(Learning to Upsample by Learning to Sample.pdf)

如果你把上面某个版本的完整 DySampleV2 代码和更新后的 yaml 再贴一次，我可以帮你逐行检查一下参数顺序和 shape，确保不会再出现那种很烦的构建期报错，然后你就可以专心看指标变化了。

Answer 5

OK，完全理解你的意思 👍：

yaml 一律保持： DySample, [2, "lp"]
parse_model 里也只传 [c1, 2, "lp"]，不再加多余参数
我给你 3 个版本的完整 DySample 类代码，
你要用哪个版本，就把工程里的 DySample 类替换成哪个（一次只留一个）。

下面 3 个方向都满足这个接口：

python
class DySample(nn.Module):
    def __init__(self, in_channels, scale=2, style='lp'):
        ...

也就是：

in_channels 由 parse_model 自动填
scale=2、style="lp" 从 yaml 里来的
其它东西（groups=4、是否 learnable / encoder / gate）都在类内部写死，不需要 yaml 传参。

公共小工具（可以放在某个 modules 文件顶部）

如果你原来已经有 normal_init / constant_init，可以跳过这段。

python
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


def normal_init(module, mean=0, std=1, bias=0):
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.normal_(module.weight, mean, std)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


def constant_init(module, val, bias=0):
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.constant_(module.weight, val)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)

版本一：DySample + 可学习静态 scope（最简单，直接在你现在最好结果上优化）

特点：

style='lp'/'pl' 都支持
groups 固定为 4，在类里面写死
没有动态 scope 卷积，但把原来固定的 0.25 换成 可学习的 α ∈ (0, 0.5)
你现在效果最好的是：lp + groups=4 + dyscope=False，这个版本就是在此基础上“微调 α”

python
class DySample(nn.Module):
    """
    版本1：LP/PL + learnable static scope
    - yaml 调用: DySample, [2, "lp"]
    - groups=4 固定
    - 没有 scope 卷积，只有一个 learnable alpha (0,0.5)
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4  # 固定 4 组

        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale**2 and in_channels % scale**2 == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        if style == "pl":
            conv_in_channels = in_channels // (scale**2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale**2)

        # offset 卷积
        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable 的静态 scope 因子 alpha ∈ (0, 0.5)
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # 初始采样位置
        self.register_buffer("init_pos", self._init_pos())

    # 获取 alpha，范围 (0, 0.5)
    def _get_alpha(self):
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def _init_pos(self):
        # [1, 2*groups*scale^2, 1, 1]
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1, dtype=torch.float32
        )
        # meshgrid，注意 indexing 参数避免 warning
        yy, xx = torch.meshgrid(h, h, indexing="ij")  # (s, s)
        pos = torch.stack([xx, yy], dim=0)  # (2, s, s)
        pos = pos.transpose(1, 2).repeat(1, self.groups, 1)  # (2, s*groups, s)
        pos = pos.reshape(1, -1, 1, 1)  # (1, 2*groups*s, 1, 1)
        return pos

    def sample(self, x, offset):
        B, _, H, W = offset.shape
        # [B, 2, groups*scale^2, H, W]
        offset = offset.view(B, 2, -1, H, W)

        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5

        # coords_w: width, coords_h: height
        grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")  # (W, H)
        coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)  # (2, H, W)
        coords = coords.unsqueeze(0).unsqueeze(1)  # (1,1,2,H,W)

        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)
        coords = 2 * (coords + offset) / normalizer - 1  # 归一化到 [-1,1]

        # 像原论文一样，用 pixel_shuffle 展开到高分辨率采样点
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        )  # (B, 2*groups, sH, sW)
        coords = (
            coords.view(B, 2, -1, self.scale * H, self.scale * W)
            .permute(0, 2, 3, 4, 1)
            .contiguous()
            .flatten(0, 1)
        )  # (B*groups, sH, sW, 2)

        # 对每个 group 做 grid_sample
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    def forward_lp(self, x):
        alpha = self._get_alpha()
        offset = self.offset(x) * alpha + self.init_pos.to(
            x.dtype
        )  # broadcast 到 [B, C, H, W]
        return self.sample(x, offset)

    def forward_pl(self, x):
        x_ = F.pixel_shuffle(x, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self.forward_pl(x)
        return self.forward_lp(x)

版本二：在版本一基础上 + DLU 风格“内容编码器”

特点：

在生成 offset 之前先做一层 1×1 压缩 + 3×3 编码（模仿 DLU 的 channel_compressor + kernel_space_generator 思路）
仍然是单输入、grid_sample 动态上采样，不需要 CUDA op
参数额外增加不多（每层几十 K 级别），但对边界/上下文感知更强，理论上对 mAP50-95 更友好

python
class DySample(nn.Module):
    """
    版本2：版本1 + DLU 风格内容编码器
    - yaml 调用: DySample, [2, "lp"]
    - groups=4 固定
    - learnable static scope alpha
    - 在 offset 之前加: 1x1 通道压缩 + 3x3 编码
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4

        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale**2 and in_channels % scale**2 == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # ---- DLU 风格内容编码器：compress + 3x3 encoder ----
        mid_channels = max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid_channels, 1)
        self.encoder = nn.Conv2d(
            mid_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            dilation=1,
            bias=True,
        )

        # offset 分支输入通道数（可能是 pl 风格下的 C/s^2）
        if style == "pl":
            conv_in_channels = in_channels // (scale**2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale**2)

        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable alpha
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        self.register_buffer("init_pos", self._init_pos())

    def _get_alpha(self):
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def _init_pos(self):
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1, dtype=torch.float32
        )
        yy, xx = torch.meshgrid(h, h, indexing="ij")
        pos = torch.stack([xx, yy], dim=0)
        pos = pos.transpose(1, 2).repeat(1, self.groups, 1)
        pos = pos.reshape(1, -1, 1, 1)
        return pos

    def sample(self, x, offset):
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)

        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")
        coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)
        coords = coords.unsqueeze(0).unsqueeze(1)

        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)
        coords = 2 * (coords + offset) / normalizer - 1

        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        )  # (B, 2*groups, sH, sW)
        coords = (
            coords.view(B, 2, -1, self.scale * H, self.scale * W)
            .permute(0, 2, 3, 4, 1)
            .contiguous()
            .flatten(0, 1)
        )

        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    def encode(self, x):
        x1 = self.compress(x)
        x1 = F.relu(x1, inplace=True)
        x1 = self.encoder(x1)
        return x1

    def _forward_lp_core(self, x):
        x_enc = self.encode(x)
        alpha = self._get_alpha()
        offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def _forward_pl_core(self, x):
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self._forward_pl_core(x)
        return self._forward_lp_core(x)

版本三：在版本二基础上 + gate（在 bilinear 和 DySample 之间自适应融合）

特点：

在版本二的基础上再多一个 gate 分支
gate 输入是原始 x，输出一个 [B,1,H,W] 的图，再插值到 [B,1,sH,sW]
输出：y = y_bilinear + g * (y_dynamic - y_bilinear)
- 大区域 /平坦区域 gate ≈ 0 → 接近普通双线性
- 边缘/纹理 gate ≈ 1 → 更依赖动态上采样
这个版本就是“最激进”的一个：内容编码 + learnable scope + gate，全都上

python
class DySample(nn.Module):
    """
    版本3：版本2 + gate（在 bilinear 与 dynamic 之间做自适应融合）
    - yaml 调用: DySample, [2, "lp"]
    - groups=4 固定
    - learnable static scope alpha
    - DLU 风格内容编码器
    - gate: y = y_bi + g * (y_dyn - y_bi)
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4

        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale**2 and in_channels % scale**2 == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # ---- DLU 风格内容编码器 ----
        mid_channels = max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid_channels, 1)
        self.encoder = nn.Conv2d(
            mid_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            dilation=1,
            bias=True,
        )

        # offset 分支
        if style == "pl":
            conv_in_channels = in_channels // (scale**2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale**2)

        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable alpha
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # gate 分支：从 x 生成 [B,1,H,W] gate
        hidden = max(in_channels // 4, 8)
        self.gate_conv1 = nn.Conv2d(in_channels, hidden, 1)
        self.gate_conv2 = nn.Conv2d(hidden, 1, 1)

        self.register_buffer("init_pos", self._init_pos())

    def _get_alpha(self):
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def _init_pos(self):
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1, dtype=torch.float32
        )
        yy, xx = torch.meshgrid(h, h, indexing="ij")
        pos = torch.stack([xx, yy], dim=0)
        pos = pos.transpose(1, 2).repeat(1, self.groups, 1)
        pos = pos.reshape(1, -1, 1, 1)
        return pos

    def sample(self, x, offset):
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)

        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")
        coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)
        coords = coords.unsqueeze(0).unsqueeze(1)

        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)
        coords = 2 * (coords + offset) / normalizer - 1

        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        )  # (B, 2*groups, sH, sW)
        coords = (
            coords.view(B, 2, -1, self.scale * H, self.scale * W)
            .permute(0, 2, 3, 4, 1)
            .contiguous()
            .flatten(0, 1)
        )

        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    def encode(self, x):
        x1 = self.compress(x)
        x1 = F.relu(x1, inplace=True)
        x1 = self.encoder(x1)
        return x1

    # 动态分支核心（不含 gate）
    def _forward_lp_core(self, x):
        x_enc = self.encode(x)
        alpha = self._get_alpha()
        offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def _forward_pl_core(self, x):
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        # 先算动态上采样
        if self.style == "pl":
            y_dyn = self._forward_pl_core(x)
        else:
            y_dyn = self._forward_lp_core(x)

        # 再算一个普通 bilinear 作为基线
        y_bi = F.interpolate(
            x, scale_factor=self.scale, mode="bilinear", align_corners=False
        )

        # gate from x -> [B,1,H,W] -> upsample 到 [B,1,sH,sW]
        g = self.gate_conv1(x)
        g = F.relu(g, inplace=True)
        g = self.gate_conv2(g)
        g = torch.sigmoid(g)  # [B,1,H,W]
        g_up = F.interpolate(
            g, scale_factor=self.scale, mode="bilinear", align_corners=False
        )  # [B,1,sH,sW]

        # 残差融合：gate=0 时等于 bilinear，gate=1 时等于 DySample
        y = y_bi + g_up * (y_dyn - y_bi)
        return y

小提示（跟你当前工程对齐）

yaml 保持不动（就你发的那段）：

yaml
head:
  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 6], 1, Concat, [1]]
  - [-1, 2, C3k2, [512, False]]

  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 4], 1, Concat, [1]]
  - [-1, 2, C3k2, [256, False]]

parse_model 里 DySample 分支可以保持你原来的简单写法（只要不放进 base_modules 就行）：

python
elif m is DySample:
    c1 = ch[f]
    c2 = c1
    scale = args[0] if len(args) > 0 else 2
    style = args[1] if len(args) > 1 else "lp"
    args = [c1, scale, style]

三个版本里 同一时间只能保留一个 class DySample，
换实验就把对应的版本覆盖掉之前的。

你可以先从 版本1 开始，因为它离你目前“最优结果”的结构最近，改动最小；
如果 mAP50-95 有提升，再试版本2、3 看看能不能继续往上抬。

如果你跑完某个版本的日志（特别是每层 DySample 的 in_channels、shape 有输出）想一起检查，我们可以再把 forward 里的 debug print 打开，一起对一下有没有哪里 shape 不对。

Answer 6

这个报错其实很精准地在骂我 😅：

The size of tensor a (2) must match the size of tensor b (16) at non-singleton dimension 2

意思就是：在

python
coords = 2 * (coords + offset) / normalizer - 1

这里，coords 和 offset 在 第 2 维（下标 2）一个是 2，一个是 16，不能广播。

根本原因：
我在给你写新版本的时候，把原论文实现里 unsqueeze 的顺序改错了，导致 coords 形状从原来的 (1, 2, 1, H, W) 变成了 (1, 1, 2, H, W)，和 offset (B, 2, 16, H, W) 对不上。

解决办法：直接把 _init_pos 和 sample() 完全恢复成论文原版那一套，错误就没了。
我们只在 offset 生成那块做改动，不动 meshgrid 和 reshape。

下面我给你 三个版本的 DySample 完整代码，都满足：

构造函数只有： __init__(self, in_channels, scale=2, style='lp')

yaml 保持不变：

yaml
head:
  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 6], 1, Concat, [1]]
  - [-1, 2, C3k2, [512, False]]

  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 4], 1, Concat, [1]]
  - [-1, 2, C3k2, [256, False]]

parse_model 里 DySample 分支写成：

python
elif m is DySample:
    c1 = ch[f]
    c2 = c1
    scale = args[0] if len(args) > 0 else 2
    style = args[1] if len(args) > 1 else "lp"
    args = [c1, scale, style]

_init_pos 和 sample() 完全照论文原始实现（只多加了 dtype / device 处理，不改维度顺序），这样就不会再出现那个 2 vs 16 的形状冲突。

一次只能保留一个 class DySample，你要试版本 1/2/3 时，把原来的 DySample 整个删掉，换成下面对应的那一段。

公共工具（如果你工程里已经有就不用重复）

python
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


def normal_init(module, mean=0, std=1, bias=0):
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.normal_(module.weight, mean, std)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


def constant_init(module, val, bias=0):
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.constant_(module.weight, val)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)

版本 1：在你现在最优的结构上，加“可学习静态 scope 系数 α”

对应论文里 LP/PL + static scope factor，原来写死 0.25，现在变成可学习 α∈(0,0.5)。
其它逻辑完全跟你之前效果最好的版本一样，groups=4，没有动态 scope 卷积。

python
class DySample(nn.Module):
    """
    版本1：LP/PL + learnable static scope (没有 scope 卷积)
    - YOLO yaml: DySample, [2, "lp"]
    - groups = 4 固定写死在类内部
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4

        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        # offset 卷积
        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable 的静态 scope 因子 alpha ∈ (0, 0.5)，初始化在 0.25
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # 初始采样位置（完全照原始实现）
        self.register_buffer("init_pos", self._init_pos())

    # ========== 这两个函数完全保持原版形状逻辑 ==========

    def _init_pos(self):
        # h: [- (s-1)/2, ..., (s-1)/2] / s
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale  # (s,)
        # torch.meshgrid([h, h]) -> (2, s, s)
        pos = torch.stack(torch.meshgrid([h, h])).transpose(1, 2)  # (2, s, s)
        pos = pos.repeat(1, self.groups, 1)  # (2, s*groups, s)
        pos = pos.reshape(1, -1, 1, 1)  # (1, 2*groups*s, 1, 1)
        return pos

    def sample(self, x, offset):
        """
        x:      [B, C, H, W]
        offset: [B, 2*groups*s^2, H, W]
        """
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)  # [B, 2, groups*s^2, H, W]

        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5

        # 原版写法，注意 unsqueeze 顺序不能改！
        coords = torch.stack(torch.meshgrid([coords_w, coords_h]))  # (2, W, H)
        coords = (
            coords.transpose(1, 2)  # (2, H, W)
            .unsqueeze(1)           # (2, 1, H, W)
            .unsqueeze(0)           # (1, 2, 1, H, W)
            .to(device=x.device, dtype=x.dtype)
        )

        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)

        # 这里之前报错，就是因为 coords 维度写坏了，现在恢复原版
        coords = 2 * (coords + offset) / normalizer - 1  # [B, 2, groups*s^2, H, W]

        # 展开到高分辨率采样点
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )  # [B*groups, sH, sW, 2]

        # 每个 group 做 grid_sample
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    # =====================================================

    def _get_alpha(self):
        # alpha ∈ (0, 0.5)
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def forward_lp(self, x):
        alpha = self._get_alpha()
        # conv_in_channels = in_channels, 所以可以直接 self.offset(x)
        offset = self.offset(x) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def forward_pl(self, x):
        # PL 风格：先像素重排到高分辨率再 conv，再 pixel_unshuffle 回低分辨率 offset
        x_ = F.pixel_shuffle(x, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self.forward_pl(x)
        return self.forward_lp(x)

版本 2：在版本 1 基础上，加 DLU 风格的内容编码器（compress + 3×3 conv）

在生成 offset 前，先做 1×1 通道压缩 + 3×3 编码，更关注局部结构。
其它接口完全一样，也没有 gate，没有动态 scope。

python
class DySample(nn.Module):
    """
    版本2：版本1 + DLU 风格内容编码器
    - YOLO yaml: DySample, [2, "lp"]
    - groups = 4 固定
    - learnable static scope alpha
    - offset 之前加 compress + 3x3 encoder
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4

        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # DLU 风格内容编码器：1x1 压缩 + 3x3 编码
        mid_channels = max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid_channels, 1)
        self.encoder = nn.Conv2d(
            mid_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            dilation=1,
            bias=True,
        )

        # offset 卷积输入
        if style == "pl":
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable alpha
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        self.register_buffer("init_pos", self._init_pos())

    # === 原版 _init_pos 和 sample（保持不动） ===

    def _init_pos(self):
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale
        pos = torch.stack(torch.meshgrid([h, h])).transpose(1, 2)
        pos = pos.repeat(1, self.groups, 1).reshape(1, -1, 1, 1)
        return pos

    def sample(self, x, offset):
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)

        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5

        coords = torch.stack(torch.meshgrid([coords_w, coords_h]))
        coords = (
            coords.transpose(1, 2)
            .unsqueeze(1)
            .unsqueeze(0)
            .to(device=x.device, dtype=x.dtype)
        )

        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)

        coords = 2 * (coords + offset) / normalizer - 1

        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )

        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    # ==========================================

    def _get_alpha(self):
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def encode(self, x):
        x1 = self.compress(x)
        x1 = F.relu(x1, inplace=True)
        x1 = self.encoder(x1)
        return x1

    def _forward_lp_core(self, x):
        x_enc = self.encode(x)
        alpha = self._get_alpha()
        offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def _forward_pl_core(self, x):
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self._forward_pl_core(x)
        return self._forward_lp_core(x)

版本 3：在版本 2 基础上，再加一个 gate：在 bilinear 和 DySample 之间自适应融合

输出：y = y_bilinear + gate * (y_dynamic - y_bilinear)
gate 从低分辨率 x 上算出来，插值到高分辨率。平坦区域更偏向 bilinear，边缘区域更偏向动态采样。

python
class DySample(nn.Module):
    """
    版本3：版本2 + gate (bilinear vs dynamic 自适应融合)
    - YOLO yaml: DySample, [2, "lp"]
    - groups = 4 固定
    - learnable static scope alpha
    - DLU 风格内容编码器
    - gate: y = y_bi + g * (y_dyn - y_bi)
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4

        assert style in ["lp", "pl"]
        if style == "pl":
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # 内容编码器
        mid_channels = max(in_channels // 2, 16)
        self.compress = nn.Conv2d(in_channels, mid_channels, 1)
        self.encoder = nn.Conv2d(
            mid_channels,
            in_channels,
            kernel_size=3,
            padding=1,
            dilation=1,
            bias=True,
        )

        # offset 卷积
        if style == "pl":
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # learnable alpha
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # gate 分支
        hidden = max(in_channels // 4, 8)
        self.gate_conv1 = nn.Conv2d(in_channels, hidden, 1)
        self.gate_conv2 = nn.Conv2d(hidden, 1, 1)

        self.register_buffer("init_pos", self._init_pos())

    # === 原版 _init_pos 和 sample ===

    def _init_pos(self):
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale
        pos = torch.stack(torch.meshgrid([h, h])).transpose(1, 2)
        pos = pos.repeat(1, self.groups, 1).reshape(1, -1, 1, 1)
        return pos

    def sample(self, x, offset):
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)

        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5

        coords = torch.stack(torch.meshgrid([coords_w, coords_h]))
        coords = (
            coords.transpose(1, 2)
            .unsqueeze(1)
            .unsqueeze(0)
            .to(device=x.device, dtype=x.dtype)
        )

        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)

        coords = 2 * (coords + offset) / normalizer - 1

        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )

        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    # ==============================

    def _get_alpha(self):
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def encode(self, x):
        x1 = self.compress(x)
        x1 = F.relu(x1, inplace=True)
        x1 = self.encoder(x1)
        return x1

    def _forward_lp_core(self, x):
        x_enc = self.encode(x)
        alpha = self._get_alpha()
        offset = self.offset(x_enc) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def _forward_pl_core(self, x):
        x_enc = self.encode(x)
        x_ = F.pixel_shuffle(x_enc, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        # 动态分支
        if self.style == "pl":
            y_dyn = self._forward_pl_core(x)
        else:
            y_dyn = self._forward_lp_core(x)

        # 双线性基线
        y_bi = F.interpolate(
            x, scale_factor=self.scale, mode="bilinear", align_corners=False
        )

        # gate from x: [B,1,H,W] -> 上采样到 [B,1,sH,sW]
        g = self.gate_conv1(x)
        g = F.relu(g, inplace=True)
        g = self.gate_conv2(g)
        g = torch.sigmoid(g)  # [B,1,H,W]

        g_up = F.interpolate(
            g, scale_factor=self.scale, mode="bilinear", align_corners=False
        )  # [B,1,sH,sW]

        # 残差融合：g=0 → 纯 bilinear，g=1 → 纯 DySample
        y = y_bi + g_up * (y_dyn - y_bi)
        return y

接下来建议你这样试

先用版本 1，确保不再报 shape 错误，看看指标（特别是 mAP50-95）比你之前的 lp + groups=4 + dyscope=False 有没有一点提升。
如果版本 1 正常，再换成版本 2、版本 3 各训一轮，对比 mAP50 / mAP50-95 的变化。

如果你换了某个版本之后又出现新的错误（比如 shape 不一致），把那一版的 DySample 全部代码 + 报错栈贴出来，我可以帮你把维度再对一遍（但只要 _init_pos 和 sample 按上面写，基本不会再出现 “2 vs 16” 这种问题了）。

Answer 7

是一样的 ✅，你这个写法和我给你的那版，在形状和数值上都等价，不会影响结果。

咱们仔细对比一下这两种写法做了什么：

你的写法

python
grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")  # (W, H)
coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)       # (2, H, W)

# 关键：变成 [1, 2, 1, H, W]
coords = coords.unsqueeze(0).unsqueeze(2)  # (1, 2, 1, H, W)

维度变化过程：

grid_w, grid_h: 形状都是 (W, H)
stack(..., dim=0) → (2, W, H)
transpose(1, 2) → (2, H, W)
unsqueeze(0) → (1, 2, H, W)
unsqueeze(2) → (1, 2, 1, H, W)

最后是 (1, 2, 1, H, W) ✅

我写的那版

python
coords = torch.stack(torch.meshgrid([coords_w, coords_h]))  # (2, W, H)
coords = coords.transpose(1, 2)                              # (2, H, W)
coords = coords.unsqueeze(1).unsqueeze(0)                    # (1, 2, 1, H, W)

维度变化过程：

stack(meshgrid(...)) → (2, W, H)
transpose(1, 2) → (2, H, W)
unsqueeze(1) → (2, 1, H, W)
unsqueeze(0) → (1, 2, 1, H, W)

最后也是 (1, 2, 1, H, W) ✅

结论：两种写法完全等价

最终形状相同：都是 (1, 2, 1, H, W)，正好可以和 offset 的 (B, 2, G, H, W) 在第 0 和第 2 维广播：
- 维 0：1 vs B → 广播
- 维 1：2 vs 2 → 匹配
- 维 2：1 vs G → 广播
- 维 3：H vs H → 匹配
- 维 4：W vs W → 匹配
unsqueeze(0).unsqueeze(2) 和 unsqueeze(1).unsqueeze(0) 只是“从不同顺序插入维度”，但最后的维度序列都是 [1, 2, 1, H, W]，数据本身没变。
你用 meshgrid(..., indexing="ij") 只是修掉 PyTorch 的 warning，这个和我不用 indexing、直接默认 'ij' 在行为上是一致的（前提是维度顺序一样）。

所以，如果你现在这一版：

python
grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")
coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)
coords = coords.unsqueeze(0).unsqueeze(2)

已经不再报之前那个：

text
RuntimeError: The size of tensor a (2) must match the size of tensor b (16) at non-singleton dimension 2

那就可以放心用，和我给你的那版在数学上是一样的，不会影响 DySample 的行为。

如果你之后还想加别的改动（比如 gate 或 encoder），只要保证 coords 最终是 [1, 2, 1, H, W]，offset 是 [B, 2, G, H, W]，就不会再遇到这个维度冲突问题了。

Answer 8

结论先说在前面：后续主线建议基于「方案 1」继续改，方案 3 保留当对比 / 做 P–R trade-off 用就好。

1. 先把三个版本的结果摆在一起

你现在的几个结果（我给它们起个名字方便说）：

🟢 原版 DySample（lp, groups=4, dyscope=False）
- P = 0.947
- R = 0.882
- mAP50 = 0.901
- mAP50–95 = 0.720
🔵 方案 1（static α 可学习）
- P = 0.940（略降）
- R = 0.830（明显下降）
- mAP50 = 0.909（↑）
- mAP50–95 = 0.725（↑ 最多）
🟥 方案 3（encoder + gate）
- P = 0.961（大幅 ↑）
- R = 0.836（下降）
- mAP50 = 0.898（略降）
- mAP50–95 = 0.721（几乎没变）

可以看到：

综合指标（mAP50 / mAP50–95）最好的是方案 1
最高精度的是方案 3，但 mAP 没超过原版，属于“好看但不赚分”型
方案 2 你说表现最差，已经可以放弃了

在 COCO 等检测任务里，评价最终看的是 mAP50–95 为主，P/R 更像是“侧面性格”。
从这个角度看：

方案 1：mAP50–95 提升最多（+0.005），说明整体 PR 曲线形状是更好的
方案 3：虽然 P 很高，但 mAP50 降了、mAP50–95 基本没动，说明 gate 目前更多是在“调分布”，不是在真正提升整体质量

2. 为什么建议你选「方案 1」作为后续主线？

结合 DySample 论文本身的设计和消融实验：

原论文里最关键的两个点是：
1. bilinear 初始化 + static scope factor 0.25（控制 offset 范围，避免采样点重叠带来的边界错误）(Learning to Upsample by Learning to Sample.pdf)
2. 分组（g=4） 提高表达能力但不增加太多计算

你现在的方案 1 做的事情是：

保留了论文里的 LP + static scope 的主体结构
只是在 0.25 外面加了一个 可学习 α，让网络自己在 ~0.25 附近微调

从实验结果来看：

它确实 把 mAP50 / mAP50–95 都往上抬了一点
虽然召回掉得有点多，但整体 mAP 曲线是提升的，这是个“有明确收益”的修改

相比之下：

方案 3 虽然通过 gate 把 精度拉得很高，但
- 召回下去了
- mAP50–95 基本没赚分
- 结构更复杂，参数更多，调参空间也更大（同时也更难稳定）
对于你现在这个阶段，更推荐先把简单的 DySample 变种榨干，再考虑复杂 gate 版

所以我的建议是：

✅ 继续基于「方案 1」做细化和小改进，把它当作主干版本。
⭕ 方案 3 可以保留用来研究 “高 P / 低 R” 风格，或者以后做 ensemble / 测试不同任务时再启用。

3. 下一步可以怎么在方案 1 上微调（预告一下）

你后面如果要继续改方案 1，可以重点围绕几块玩——都在 不改 yaml 接口 的前提下：

约束 α 不要跑太远
- 当前 α 学习范围是 (0, 0.5)，可以：
  - 在 loss 里对 (α - 0.25)^2 加一个很小的正则
  - 或者把范围改成稍窄一点，比如 (0.15, 0.35)，让它只在理论合适区间内细调
只对某一层 DySample 用 encoder / gate 做局部增强（比如只在高层特征上用）
- 这样既不破坏你现在整体的收敛行为，又能在局部场景（大目标 / 复杂边缘）上试试收益
针对召回下降的问题，微调 NMS / conf 阈值
- 通常动态上采样会让特征更“干净”，有时候默认阈值对它来说太严格，
- 适当降低 conf / 调整 NMS IoU，有可能把 R 拉回来而不伤 mAP 太多

如果你接下来想继续改代码，我们可以在 方案 1 的实现上，一点点加这些小改动，而不动 yaml。

一句话总结：

从 mAP50–95 提升 + 模型复杂度 + 可后续扩展性三个角度看，
👉 优先选「方案 1」作为主线继续优化；
方案 3 先当“侧线实验”，等主线收敛、性能卡住，再考虑把 gate 和 encoder 慢慢引入进来。

Answer 9

好，那我们就把“方案 1”（可学习静态 scope 的 DySample）整理成一套能直接跑的代码给你 👍

你只需要改两块：

DySample 类（比如放在 ultralytics/nn/modules/block.py 里）
parse_model 里 DySample 的那段解析逻辑

yaml 保持不变：

yaml
head:
  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 6], 1, Concat, [1]]
  - [-1, 2, C3k2, [512, False]]

  - [-1, 1, DySample, [2, "lp"]]
  - [[-1, 4], 1, Concat, [1]]
  - [-1, 2, C3k2, [256, False]]

一、DySample 方案 1 完整实现（可学习静态 scope，LP/PL 都支持）

放在你原来定义 DySample 的地方（比如 ultralytics/nn/modules/block.py），
保证工程里同名类 DySample 只有这一个版本。

python
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


def normal_init(module, mean=0, std=1, bias=0):
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.normal_(module.weight, mean, std)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


def constant_init(module, val, bias=0):
    if hasattr(module, "weight") and module.weight is not None:
        nn.init.constant_(module.weight, val)
    if hasattr(module, "bias") and module.bias is not None:
        nn.init.constant_(module.bias, bias)


class DySample(nn.Module):
    """
    方案1：LP/PL + 可学习静态 scope（无 scope 卷积）
    - YOLO yaml: DySample, [2, "lp"]
    - groups 固定为 4
    - alpha ∈ (0, 0.5) 可学习，初始化为 0.25
    """

    def __init__(self, in_channels, scale=2, style="lp"):
        super().__init__()
        self.scale = scale
        self.style = style
        self.groups = 4  # 固定 4 组

        assert style in ["lp", "pl"]
        if style == "pl":
            # PL 风格要求 in_channels 可被 scale^2 整除
            assert in_channels >= scale ** 2 and in_channels % (scale ** 2) == 0
        # 分组也要整除
        assert in_channels >= self.groups and in_channels % self.groups == 0

        # 决定 offset 卷积的输入输出通道
        if style == "pl":
            # PL 风格：conv 输入通道 = C / s^2，输出通道 = 2 * groups
            conv_in_channels = in_channels // (scale ** 2)
            out_channels = 2 * self.groups
        else:
            # LP 风格：conv 输入通道 = C，输出通道 = 2 * groups * s^2
            conv_in_channels = in_channels
            out_channels = 2 * self.groups * (scale ** 2)

        # offset 卷积
        self.offset = nn.Conv2d(conv_in_channels, out_channels, 1)
        normal_init(self.offset, std=0.001)

        # 可学习的静态 scope 因子 alpha ∈ (0, 0.5)，初始化成 0.25
        init_alpha = 0.25
        logit = math.log(init_alpha / (0.5 - init_alpha))  # 反 sigmoid 映射
        self.logit_alpha = nn.Parameter(torch.tensor(logit, dtype=torch.float))

        # 初始采样位置（论文原版实现）
        self.register_buffer("init_pos", self._init_pos())

    # ------------ 下面两个函数保持官方实现的形状逻辑 ------------

    def _init_pos(self):
        """
        返回 init_pos: [1, 2 * groups * scale^2, 1, 1]
        """
        # h: [-(s-1)/2, ..., (s-1)/2] / s
        h = torch.arange(
            (-self.scale + 1) / 2, (self.scale - 1) / 2 + 1
        ) / self.scale  # (s,)

        # meshgrid: (s, s)，加 indexing="ij" 避免警告
        yy, xx = torch.meshgrid(h, h, indexing="ij")  # (s, s)

        # stack 后 (2, s, s)，然后在 dim=1 上 repeat groups 次
        pos = torch.stack([xx, yy], dim=0)           # (2, s, s)
        pos = pos.transpose(1, 2).repeat(1, self.groups, 1)  # (2, s*groups, s)
        pos = pos.reshape(1, -1, 1, 1)  # (1, 2 * groups * s^2, 1, 1)
        return pos

    def sample(self, x, offset):
        """
        x:      [B, C, H, W]
        offset: [B, 2*groups*scale^2, H, W]
        """
        B, _, H, W = offset.shape
        offset = offset.view(B, 2, -1, H, W)  # [B, 2, G, H, W], G = groups * scale^2

        # 构造基础坐标 [1, 2, 1, H, W]
        coords_h = torch.arange(H, device=x.device, dtype=x.dtype) + 0.5
        coords_w = torch.arange(W, device=x.device, dtype=x.dtype) + 0.5
        grid_w, grid_h = torch.meshgrid(coords_w, coords_h, indexing="ij")  # (W, H)
        coords = torch.stack([grid_w, grid_h], dim=0).transpose(1, 2)       # (2, H, W)
        coords = coords.unsqueeze(0).unsqueeze(2)                            # (1, 2, 1, H, W)

        # 归一化系数
        normalizer = torch.tensor(
            [W, H], dtype=x.dtype, device=x.device
        ).view(1, 2, 1, 1, 1)

        # coords 和 offset 广播相加，结果形状 [B, 2, G, H, W]
        coords = 2 * (coords + offset) / normalizer - 1

        # 展开到高分辨率采样点：
        # 先 view 成 [B, 2*G, H, W]，再 pixel_shuffle(r=scale)，得到 [B, 2*G/r^2, sH, sW] = [B, 2*groups, sH, sW]
        coords = F.pixel_shuffle(
            coords.view(B, -1, H, W), self.scale
        ).view(
            B, 2, -1, self.scale * H, self.scale * W
        ).permute(
            0, 2, 3, 4, 1
        ).contiguous().flatten(
            0, 1
        )  # [B*groups, sH, sW, 2]

        # 对每个 group 做 grid_sample
        return F.grid_sample(
            x.reshape(B * self.groups, -1, H, W),
            coords,
            mode="bilinear",
            align_corners=False,
            padding_mode="border",
        ).view(B, -1, self.scale * H, self.scale * W)

    # --------------------------------------------------

    def _get_alpha(self):
        """
        alpha ∈ (0, 0.5)
        """
        return 0.5 * torch.sigmoid(self.logit_alpha)

    def forward_lp(self, x):
        """
        LP 风格：offset = Conv(x)，直接在低分辨率上算 offset
        """
        alpha = self._get_alpha()
        offset = self.offset(x) * alpha + self.init_pos.to(x.dtype)
        return self.sample(x, offset)

    def forward_pl(self, x):
        """
        PL 风格：先 pixel_shuffle 到高分辨率，再 conv，再 pixel_unshuffle 回低分辨率 offset
        """
        x_ = F.pixel_shuffle(x, self.scale)
        alpha = self._get_alpha()
        offset = (
            F.pixel_unshuffle(self.offset(x_), self.scale) * alpha
            + self.init_pos.to(x.dtype)
        )
        return self.sample(x, offset)

    def forward(self, x):
        if self.style == "pl":
            return self.forward_pl(x)
        return self.forward_lp(x)

二、`parse_model` 里 DySample 的解析代码

关键点有两个：

不要把 DySample 放进 base_modules，否则会走通用卷积那套逻辑，导致 c2 / args 搞错。
专门给 DySample 加一个分支，只传 [in_channels, scale, style] 这三个参数。

1）`base_modules` 里删掉 DySample

你原来是这样的（节选）：

python
base_modules = frozenset(
    {
        Classify,
        Conv,
        ...
        LSKBlock,
        DySample,   # ❌ 不要放这里
        MSCA,
    }
)

改成：

python
base_modules = frozenset(
    {
        Classify,
        Conv,
        CBAM,
        ConvTranspose,
        GhostConv,
        Bottleneck,
        GhostBottleneck,
        SPP,
        SPPF,
        C2fPSA,
        C2PSA,
        DWConv,
        Focus,
        BottleneckCSP,
        C1,
        C2,
        C2f,
        C3k2,
        RepNCSPELAN4,
        ELAN1,
        ADown,
        AConv,
        SPPELAN,
        C2fAttn,
        C3,
        C3TR,
        C3Ghost,
        torch.nn.ConvTranspose2d,
        DWConvTranspose2d,
        C3x,
        RepC3,
        PSA,
        SCDown,
        C2fCIB,
        A2C2f,
        LSKBlock,
        MSCA,
        # ⬆️ 这里不要写 DySample
    }
)

2）for 循环里加 DySample 专用分支

在你原来的 for i, (f, n, m, args) in enumerate(...): 大循环里，
留意你之前的结构，大概是这样（节选）：

python
    for i, (f, n, m, args) in enumerate(d["backbone"] + d["head"]):
        m = ...
        ...
        if m in base_modules:
            ...
        elif m is AIFI:
            ...
        ...
        elif m in frozenset({TorchVision, Index}):
            ...
        elif m is DySample:
            c2 = ch[f]
            args = [ch[f], *args]
        else:
            c2 = ch[f]

我们把 DySample 分支改成带 scale/style 解析的版本：

python
        ...
        elif m in frozenset({TorchVision, Index}):
            c2 = args[0]
            c1 = ch[f]
            args = [*args[1:]]
        elif m is DySample:
            # DySample 不改变通道数：c2 = c1
            c1 = ch[f]
            c2 = c1

            # yaml 给的是 [2, "lp"]，只解析这两个参数
            scale = args[0] if len(args) > 0 else 2
            style = args[1] if len(args) > 1 else "lp"

            # DySample(in_channels, scale, style)
            args = [c1, scale, style]
        else:
            c2 = ch[f]

这样：

yaml 里的 [2, "lp"] 会被扩充成 [in_channels, 2, "lp"]
和我们上面实现的 DySample.__init__(self, in_channels, scale=2, style='lp') 完全对应
DySample 输出通道数 c2 会被正确记成 ch[f]，后面 Concat/Detect 都能正常推断通道数

三、你接下来可以直接做的事

把你工程里原来的 DySample 替换成上面“方案 1”的这个版本。
确认 parse_model 里：
- base_modules 没有 DySample；
- 有上面那段 elif m is DySample: 的分支。
yaml 保持你现在用的那一份不变。

这样启动 python train.py 时：

模型构建阶段不会再出现那个 2 vs 16 的 shape 报错；
stride 推理阶段也能正常跑完；
指标表现就是你前面测到的那版：在原有 lp, groups=4, dyscope=False 的基础上，mAP50 和 mAP50–95 都有小提升。

如果你按这个版本跑完，又想在这个基础上加小改动（比如约束 α 的范围、只给某一层上 encoder/gate），可以把新的报错或结果再贴出来，我们可以继续在这个“方案 1”版本上迭代。

질문

사고

Comparing DySample and DLU upsampling methods

Comparing DySample and DLU on upsampling for YOLO FPN

Comparing DLU and DySample for dynamic upsampling

Comparing DLU and DySample for YOLO

Integrating DLU into YOLOv11

Modifying YOLO YAML for custom modules

Explaining model choice: static vs. dynamic

Comparing DySample and DLU dynamic upsamples

Recommending DySample for YOLO implementation

Supporting DySample integration into YOLO

Comparing DySample and DLU for YOLO Integration

답변

1. DySample vs DLU：谁更轻，谁效果更好，谁更新？

1.1 参数量 & 复杂度（以 2× 上采样、C=256 为例）

1.2 性能（从论文层面看）

1.3 哪个更“新”？

2. 你说的 DySample 静态/动态模型现象怎么理解？

3. 你贴的 DLU 代码是不是对的？yaml 能不能直接插入？

3.1 这是不是 DLU 的代码？

3.2 yaml 能不能直接插入这段代码？

4. 建议：在 YOLOv11 里怎么优雅地替换上采样（以 DySample 为例）

4.1 在代码里定义模块

4.2 在 yaml 里替换原来的双线性上采样

4.3 训练上的注意点

5. 如果你“非要”上 DLU，要注意什么？

一句话总结给你：

질문

Ultralytics YOLO11 object detection model with P3/8 - P5/32 outputs

Model docs: https://docs.ultralytics.com/models/yolo11

Task docs: https://docs.ultralytics.com/tasks/detect

Parameters

[depth, width, max_channels]

YOLO11n backbone

[from, repeats, module, args]

- [-1, 1, MSCA, [1024]]

YOLO11n head

- [-1, 1, nn.Upsample, [None, 2, "nearest"]]

- [-1, 1, nn.Upsample, [None, 2, "nearest"]]

- [[-1, 13], 1, Concat, [1]]

- [[-1, 10], 1, Concat, [1]]

- [[16, 19, 22], 1, Detect, [nc]]

질문

Ultralytics YOLO11 object detection model with P3/8 - P5/32 outputs

Model docs: https://docs.ultralytics.com/models/yolo11

Task docs: https://docs.ultralytics.com/tasks/detect

Parameters

[depth, width, max_channels]

YOLO11n backbone

[from, repeats, module, args]

- [-1, 1, MSCA, [1024]]

YOLO11n head

- [-1, 1, nn.Upsample, [None, 2, "nearest"]]

- [-1, 1, nn.Upsample, [None, 2, "nearest"]]

- [[-1, 13], 1, Concat, [1]]

- [[-1, 10], 1, Concat, [1]]

- [[16, 19, 22], 1, Detect, [nc]]

사고

Debugging YOLOv11 DySample integration

Refining DySample module channel handling

Investigating DySample class TypeError

Identifying potential source of None issue

Figuring out where max_channels becomes None

Identifying where None comes from in the model

Understanding module errors and DySample in base modules

Explaining errors and miscounting layer indices

Clarifying indices and error with detection output

Breaking down feature operations and potential issue with concat

Analyzing adjustments in path and module handling

Analyzing DySample integration and TypeError cause

Fixing DySample scale mismatch and IndexError

Fixing DySample integration and parse_model issues

Providing clear fix for errors when modifying parse_model with DySample

답변

一、两个报错分别是什么原因？

1）TypeError: '<' not supported between instances of 'int' and 'NoneType'

2）IndexError: list index out of range（出现在 Detect 上）

二、DySample 在 parse_model 里的正确写法

1）从 base_modules 里删掉 DySample

1）`TypeError: '<' not supported between instances of 'int' and 'NoneType'`

2）`IndexError: list index out of range`（出现在 Detect 上）

1）从 `base_modules` 里删掉 `DySample`

1.1 `style='lp'`：Linear + PixelShuffle（LP 风格）

1.2 `style='pl'`：PixelShuffle + Linear（PL 风格）

3.2 什么时候考虑 `style='pl'`（PL 风格）