Video Processing Optimization Suggestions

Created: November 6, 2024

Answered using O1 Mini by Chat01

Question

import cv2
import numpy as np
import time
import pygame
from OpenGL.GL import *
from OpenGL.GLUT import *
from OpenGL.GLU import *
from pygame.locals import *
from tqdm import tqdm

Set up the OpenGL display window

def init_gl(width, height):
    pygame.display.set_mode((width, height), DOUBLEBUF | OPENGL)
    glViewport(0, 0, width, height)
    glMatrixMode(GL_PROJECTION)
    glLoadIdentity()
    gluOrtho2D(0, width, 0, height)
    glMatrixMode(GL_MODELVIEW)
    glEnable(GL_BLEND)
    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA)

Load video frames

def load_video_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

Render a video frame with transparency

def render_frame(frame, x, y, width, height, alpha):
    # Convert BGR to RGBA
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGBA)
    frame = cv2.resize(frame, (width, height))

    # Apply transparency
    frame[:, :, 3] = (frame[:, :, 3] * alpha).astype(np.uint8)

    # Create a texture
    texture_id = glGenTextures(1)
    glBindTexture(GL_TEXTURE_2D, texture_id)
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR)
    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR)
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, frame)

    # Draw the texture
    glLoadIdentity()
    glTranslatef(x, y, 0)
    glBegin(GL_QUADS)
    glTexCoord2f(0, 0); glVertex2f(0, 0)
    glTexCoord2f(1, 0); glVertex2f(width, 0)
    glTexCoord2f(1, 1); glVertex2f(width, height)
    glTexCoord2f(0, 1); glVertex2f(0, height)
    glEnd()
    glDeleteTextures([texture_id])

Embed one video's frames into another video

def process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha, frame_rate=30):
    # Load background and overlay video frames
    background_frames = load_video_frames(background_video_path)
    overlay_frames = load_video_frames(overlay_video_path)

    # Output video settings
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')
    output_video = cv2.VideoWriter(output_video_path, fourcc, frame_rate,
                                   (background_frames[0].shape[1], background_frames[0].shape[0]))

    # Initialize pygame and OpenGL
    pygame.init()
    init_gl(background_frames[0].shape[1], background_frames[0].shape[0])

    start_time = time.time()

    # Process each frame
    for i in tqdm(range(min(len(background_frames), len(overlay_frames))), desc="Processing video frames", unit="frame"):
        glClear(GL_COLOR_BUFFER_BIT)

        # Background video
        render_frame(background_frames[i], 0, 0, background_frames[i].shape[1], background_frames[i].shape[0], 1.0)

        # Overlay video
        render_frame(overlay_frames[i], x, y, width, height, alpha)

        # Flip the display and save the current frame
        pygame.display.flip()
        pygame.image.save(pygame.display.get_surface(), "temp_frame.png")  # Save as a temporary image
        temp_frame = cv2.imread("temp_frame.png")
        output_video.write(temp_frame)

    end_time = time.time()
    total_time = end_time - start_time
    print(f"Video processing complete, elapsed time: {total_time:.2f}s")

    output_video.release()
    pygame.quit()

if __name__ == "__main__":
    # Input video paths
    background_video_path = '/mnt/lgs/Open-Sora/moviepys/input/291514.mp4'  # Background video
    overlay_video_path = '/mnt/lgs/Open-Sora/moviepys/input/17095_1727158011_raw.mp4'  # Overlay video
    output_video_path = '/mnt/lgs/Open-Sora/gstreamers/output/output_video.mp4'  # Output video

    # Transparency and position settings
    alpha = 0.5  # Transparency (0.0 - 1.0)
    x, y = 100, 100  # Overlay position
    width, height = 640, 360  # Overlay size

    process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha)

opengl_venvroot@ecs-60954587:/mnt/lgs/Open-Sora# /mnt/lgs/Open-Sora/opengl_venv/bin/python /mnt/lgs/Open-Sora/gstreamers/video_overlay.py
pygame 2.6.0 (SDL 2.28.4, Python 3.10.6)
Hello from the pygame community. https://www.pygame.org/contribute.html
OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'
ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default
Processing video frames: 100%|███████████████████████████████████████████████████████████████████████████████| 566/566 [01:07<00:00, 8.41frame/s]
Video processing complete, elapsed time: 67.33s

Too slow, 8.41 frames/s

Thought

Loading the script for review

The user provided Python code that uses OpenGL, Pygame, and OpenCV to overlay one video onto another with adjustable transparency. Processing turned out to be slow and needs optimization.

Optimizing the rendering pipeline

Exploring options
Handling the frame compositing directly in OpenCV, via alpha blending, would avoid OpenGL and Pygame entirely.

Adjusting the approach
Recommend doing the frame overlay directly in OpenCV, dropping OpenGL and Pygame to raise the frame rate.

Optimizing video processing

Rendering every frame with Pygame and OpenGL, saving it to a temporary PNG, then reading it back with OpenCV to write the output video is what makes processing slow. Handling frames directly in memory with OpenCV would improve performance.

Increasing processing speed

Use OpenCV's addWeighted, or direct pixel manipulation, to composite one frame onto another with transparency. Avoid loading all frames at once; process the video frame by frame to save memory, and make sure the codec used for writing is efficient.

  • In the optimized pipeline, process the background and overlay videos frame by frame, adjusting transparency, position, and size to prepare each frame of the output video.

Reducing memory usage
In this approach, I avoid loading all frames into memory at once and rely on cv2.addWeighted for the blending.

Improving time control

tqdm is used to display a progress bar. For large frames, multithreading or parallel processing can speed things up, and GPU acceleration is also worth considering. Make sure the encoder and container are fast and compatible, and avoid resizing images at runtime.

Answer

You're experiencing slow performance because the current approach involves rendering each frame with OpenGL and Pygame, saving it as a temporary PNG image, and then reading it back with OpenCV to write to the output video. This method introduces significant overhead due to disk I/O operations and the computational cost of rendering each frame individually.

To achieve faster processing, you can streamline your workflow by leveraging OpenCV's efficient frame manipulation capabilities directly, eliminating the need for OpenGL and Pygame entirely. OpenCV is highly optimized for video processing tasks and can handle frame blending much more efficiently.

Here's how you can optimize your script:

  1. Use OpenCV for Frame Overlaying: OpenCV can handle alpha blending natively, which allows you to overlay one video onto another efficiently without the need for external rendering libraries.

  2. Process Frames Sequentially: Instead of loading all frames into memory at once, process each frame sequentially. This reduces memory usage and speeds up processing.

  3. Avoid Disk I/O for Temporary Frames: By performing all operations in memory, you eliminate the need for saving and reading temporary image files, which significantly reduces processing time.

  4. Optimize Video Writing: Ensure that you're using an efficient codec and that the video writer is properly configured for your output format.
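Conceptually, points 1-3 collapse into a single in-memory loop. Here is a hedged sketch of that core, where background_cap, output_video, and a pre-resized overlay_frame are assumed to be set up as in the full script that follows:

python
# Core in-memory loop (sketch): no OpenGL, no Pygame, no disk round trip.
while True:
    ret, frame_bg = background_cap.read()
    if not ret:
        break
    roi = frame_bg[y:y+height, x:x+width]           # region the overlay covers
    frame_bg[y:y+height, x:x+width] = cv2.addWeighted(
        overlay_frame, alpha, roi, 1 - alpha, 0)    # alpha blend in place
    output_video.write(frame_bg)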

Below is the optimized script using OpenCV only:

python
import cv2
import numpy as np
from tqdm import tqdm

def process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha, frame_rate=30):
    # Open background and overlay videos
    background_cap = cv2.VideoCapture(background_video_path)
    overlay_cap = cv2.VideoCapture(overlay_video_path)

    if not background_cap.isOpened():
        print(f"Error opening background video: {background_video_path}")
        return
    if not overlay_cap.isOpened():
        print(f"Error opening overlay video: {overlay_video_path}")
        return

    # Get background video properties
    bg_width = int(background_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    bg_height = int(background_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    bg_fps = background_cap.get(cv2.CAP_PROP_FPS)
    bg_total_frames = int(background_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Get overlay video properties
    ov_fps = overlay_cap.get(cv2.CAP_PROP_FPS)
    ov_total_frames = int(overlay_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Set frame rate to min of both videos if not specified
    if frame_rate is None:
        frame_rate = min(bg_fps, ov_fps)

    # Initialize VideoWriter
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can choose other codecs like 'XVID' if needed
    output_video = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (bg_width, bg_height))

    # Calculate total frames to process based on the background video
    total_frames = bg_total_frames

    # Optional: adjust overlay video frame rate to match the background
    if ov_fps != frame_rate:
        overlay_frame_step = int(round(ov_fps / frame_rate))
    else:
        overlay_frame_step = 1

    # Process frames
    with tqdm(total=total_frames, desc="Processing video frames", unit="frame") as pbar:
        for i in range(total_frames):
            ret_bg, frame_bg = background_cap.read()
            if not ret_bg:
                break  # End of background video

            # Determine the corresponding overlay frame
            overlay_frame_idx = i * overlay_frame_step
            overlay_cap.set(cv2.CAP_PROP_POS_FRAMES, overlay_frame_idx)
            ret_overlay, frame_overlay = overlay_cap.read()
            if not ret_overlay:
                # Optionally loop the overlay video or use the last frame
                overlay_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                ret_overlay, frame_overlay = overlay_cap.read()
                if not ret_overlay:
                    print("Error reading overlay video.")
                    break

            # Resize the overlay frame
            frame_overlay_resized = cv2.resize(frame_overlay, (width, height))

            # Ensure the overlay fits within the background frame
            if y + height > bg_height or x + width > bg_width:
                print("Overlay frame exceeds background frame dimensions.")
                break

            # Create ROI on the background frame
            roi = frame_bg[y:y+height, x:x+width]

            # Convert overlay and ROI to float for blending
            overlay = frame_overlay_resized.astype(float)
            background = roi.astype(float)

            # Blend the overlay with the background ROI
            blended = cv2.addWeighted(overlay, alpha, background, 1 - alpha, 0)

            # Replace the ROI on the background frame with the blended result
            frame_bg[y:y+height, x:x+width] = blended.astype(np.uint8)

            # Write the frame to the output video
            output_video.write(frame_bg)
            pbar.update(1)

    # Release resources
    background_cap.release()
    overlay_cap.release()
    output_video.release()
    print("Video processing complete.")

if __name__ == "__main__":
    # Input video paths
    background_video_path = '/mnt/lgs/Open-Sora/moviepys/input/291514.mp4'  # Background video
    overlay_video_path = '/mnt/lgs/Open-Sora/moviepys/input/17095_1727158011_raw.mp4'  # Overlay video
    output_video_path = '/mnt/lgs/Open-Sora/gstreamers/output/output_video.mp4'  # Output video

    # Transparency and position settings
    alpha = 0.5  # Transparency (0.0 - 1.0)
    x, y = 100, 100  # Overlay position
    width, height = 640, 360  # Overlay size

    process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha)

Key Improvements Explained

  1. Eliminated Pygame and OpenGL:

    • By removing Pygame and OpenGL, you eliminate the overhead associated with initializing these libraries and rendering frames through OpenGL.
  2. Frame-by-Frame Processing with OpenCV:

    • The script now reads and processes each frame sequentially using OpenCV's efficient frame handling capabilities.
  3. Direct Frame Blending:

    • The cv2.addWeighted function efficiently blends the overlay frame with the background frame using the specified alpha value.
  4. Avoiding Disk I/O for Temporary Frames:

    • All operations are performed in memory without saving intermediate frames to disk, which drastically reduces processing time.
  5. Handling Different Frame Rates:

    • The script adjusts the overlay frame rate to match the background video frame rate to ensure smooth overlaying.
  6. Progress Tracking with tqdm:

    • Provides a real-time progress bar to monitor the processing status.
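To make point 3 concrete: cv2.addWeighted(src1, a, src2, b, gamma) computes src1*a + src2*b + gamma per pixel, saturates, and returns the input dtype, so it also works directly on uint8 frames. A tiny self-contained check with made-up constant frames:

python
import cv2
import numpy as np

# Stand-in frames: a light background and a dark overlay
bg = np.full((360, 640, 3), 200, dtype=np.uint8)
fg = np.full((360, 640, 3), 40, dtype=np.uint8)
alpha = 0.5

# dst = fg*alpha + bg*(1 - alpha) + 0, saturated and returned as uint8
blended = cv2.addWeighted(fg, alpha, bg, 1 - alpha, 0)
print(blended[0, 0])  # [120 120 120]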

Additional Optimizations

  1. Preallocate VideoWriter:

    • Ensure that the VideoWriter is correctly initialized with the right codec and frame size to prevent runtime adjustments that can slow down processing.
  2. Avoid Repeated Resizing:

    • If the overlay video is already at the desired size, you can skip the resizing step to save computational resources.
  3. Looping Overlay Video:

    • If the overlay video is shorter than the background, the script loops the overlay video to match the background's length, ensuring consistent overlay without stopping prematurely.
  4. GPU Acceleration:

    • For even faster processing, consider using GPU-accelerated libraries like CuPy or OpenCV's CUDA modules, especially if you're dealing with high-resolution videos or require real-time processing.
  5. Multithreading or Multiprocessing:

    • Utilize Python's multiprocessing or multithreading modules to parallelize frame processing, taking full advantage of multi-core CPUs.
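As a sketch of point 5, a single background thread that prefetches decoded frames into a bounded queue overlaps decoding with blending and encoding. The names here (start_reader, maxsize) are illustrative and not part of the script above:

python
import queue
import threading

import cv2

def start_reader(cap, maxsize=64):
    """Decode frames from cap on a background thread into a bounded queue."""
    q = queue.Queue(maxsize=maxsize)

    def worker():
        while True:
            ret, frame = cap.read()
            q.put(frame if ret else None)  # None marks end of stream
            if not ret:
                break

    threading.Thread(target=worker, daemon=True).start()
    return q

# Usage: frames = start_reader(background_cap), then in the main loop
# replace background_cap.read() with frame_bg = frames.get().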

Benchmarking the Optimized Script

With these optimizations, the processing speed should increase significantly. Here's an approximate expectation based on the improvements:

  • Previous Speed: ~8.41 frames/second
  • Optimized Speed: Potentially > 100 frames/second (depending on hardware)

Conclusion

By leveraging OpenCV's efficient frame processing capabilities and eliminating unnecessary rendering steps, you can achieve a substantial increase in processing speed. This approach not only simplifies the workflow but also makes the script more maintainable and scalable for larger video processing tasks.

If you still require specific OpenGL features for more complex rendering, consider integrating GPU-based processing libraries that can handle both rendering and frame manipulation more efficiently. However, for straightforward overlay tasks with transparency, OpenCV alone should suffice and provide the best performance.

Question

import cv2
import numpy as np
from tqdm import tqdm

def process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha, frame_rate=30):
    # Open background and overlay videos
    background_cap = cv2.VideoCapture(background_video_path)
    overlay_cap = cv2.VideoCapture(overlay_video_path)

    if not background_cap.isOpened():
        print(f"Error opening background video: {background_video_path}")
        return
    if not overlay_cap.isOpened():
        print(f"Error opening overlay video: {overlay_video_path}")
        return

    # Get background video properties
    bg_width = int(background_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    bg_height = int(background_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    bg_fps = background_cap.get(cv2.CAP_PROP_FPS)
    bg_total_frames = int(background_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Get overlay video properties
    ov_fps = overlay_cap.get(cv2.CAP_PROP_FPS)
    ov_total_frames = int(overlay_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Set frame rate to min of both videos if not specified
    if frame_rate is None:
        frame_rate = min(bg_fps, ov_fps)

    # Initialize VideoWriter
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can choose other codecs like 'XVID' if needed
    output_video = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (bg_width, bg_height))

    # Calculate total frames to process based on the background video
    total_frames = bg_total_frames

    # Optional: adjust overlay video frame rate to match the background
    if ov_fps != frame_rate:
        overlay_frame_step = int(round(ov_fps / frame_rate))
    else:
        overlay_frame_step = 1

    # Process frames
    with tqdm(total=total_frames, desc="Processing video frames", unit="frame") as pbar:
        for i in range(total_frames):
            ret_bg, frame_bg = background_cap.read()
            if not ret_bg:
                break  # End of background video

            # Determine the corresponding overlay frame
            overlay_frame_idx = i * overlay_frame_step
            overlay_cap.set(cv2.CAP_PROP_POS_FRAMES, overlay_frame_idx)
            ret_overlay, frame_overlay = overlay_cap.read()
            if not ret_overlay:
                # Optionally loop the overlay video or use the last frame
                overlay_cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                ret_overlay, frame_overlay = overlay_cap.read()
                if not ret_overlay:
                    print("Error reading overlay video.")
                    break

            # Resize the overlay frame
            frame_overlay_resized = cv2.resize(frame_overlay, (width, height))

            # Ensure the overlay fits within the background frame
            if y + height > bg_height or x + width > bg_width:
                print("Overlay frame exceeds background frame dimensions.")
                break

            # Create ROI on the background frame
            roi = frame_bg[y:y+height, x:x+width]

            # Convert overlay and ROI to float for blending
            overlay = frame_overlay_resized.astype(float)
            background = roi.astype(float)

            # Blend the overlay with the background ROI
            blended = cv2.addWeighted(overlay, alpha, background, 1 - alpha, 0)

            # Replace the ROI on the background frame with the blended result
            frame_bg[y:y+height, x:x+width] = blended.astype(np.uint8)

            # Write the frame to the output video
            output_video.write(frame_bg)
            pbar.update(1)

    # Release resources
    background_cap.release()
    overlay_cap.release()
    output_video.release()
    print("Video processing complete.")

if __name__ == "__main__":
    # Input video paths
    background_video_path = '/mnt/lgs/Open-Sora/moviepys/input/291514.mp4'  # Background video
    overlay_video_path = '/mnt/lgs/Open-Sora/moviepys/input/17095_1727158011_raw.mp4'  # Overlay video
    output_video_path = '/mnt/lgs/Open-Sora/gstreamers/output/output_video.mp4'  # Output video

    # Transparency and position settings
    alpha = 0.5  # Transparency (0.0 - 1.0)
    x, y = 100, 100  # Overlay position
    width, height = 640, 360  # Overlay size

    process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha)

opengl_venvroot@ecs-60954587:/mnt/lgs/Open-Sora# /mnt/lgs/Open-Sora/opengl_venv/bin/python /mnt/lgs/Open-Sora/gstreamers/video_overlay.py
Processing video frames: 100%|████████████████████████████████████████████████████████████████████████████████| 1366/1366 [03:09<00:00, 7.20frame/s]
Video processing complete.

Still too slow, speed it up. Answer in Chinese and give the full modified code. I have a GPU!

Thought

Providing GPU-accelerated code

I'm researching how to accelerate video processing with OpenCV on the GPU to get higher throughput.

Optimizing code performance

Looking for ways to improve performance using OpenCV's CUDA support or other libraries that can exploit the GPU. API, performance, and complexity are the key considerations; the code should stay simple, fast, and maintainable.

Leveraging GPU acceleration

Thinking about how to use CUDA-backed functions to speed the code up; OpenCV would need to be built with CUDA support. When writing the code, the addWeighted call can be replaced with a custom CUDA kernel or equivalent GPU array operations.

Answer

You can raise the processing speed by moving the work onto the GPU. Below is the complete code, which uses the CuPy library to run the compute-heavy array operations on the GPU and thereby increase throughput.

Note: to run the code below, you need to:

  1. Install a CUDA-enabled OpenCV build (usually this means compiling OpenCV from source with CUDA support enabled).
  2. Install the CuPy library (pip install cupy-cuda11x; pick the CuPy build matching your CUDA version).
  3. Make sure your GPU supports CUDA and the CUDA driver is installed correctly.
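Before running the full script, a quick sanity check can confirm the environment. This is a sketch that only prints device counts: CuPy must see a CUDA device for the script below, while the OpenCV count is non-zero only if your build was compiled with CUDA.

python
import cv2
import cupy as cp

# CUDA devices visible to CuPy (the GPU script below relies on this)
print("CuPy CUDA devices:", cp.cuda.runtime.getDeviceCount())

# Non-zero only if OpenCV itself was built with CUDA support
print("OpenCV CUDA devices:", cv2.cuda.getCudaEnabledDeviceCount())

With the environment in place, here is the full GPU-accelerated script: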
python
import cv2
import numpy as np
import cupy as cp
from tqdm import tqdm

def process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha, frame_rate=None):
    # Open the background and overlay videos
    background_cap = cv2.VideoCapture(background_video_path)
    overlay_cap = cv2.VideoCapture(overlay_video_path)

    if not background_cap.isOpened():
        print(f"Cannot open background video: {background_video_path}")
        return
    if not overlay_cap.isOpened():
        print(f"Cannot open overlay video: {overlay_video_path}")
        return

    # Get background video properties
    bg_width = int(background_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    bg_height = int(background_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    bg_fps = background_cap.get(cv2.CAP_PROP_FPS)
    bg_total_frames = int(background_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Get overlay video properties
    ov_fps = overlay_cap.get(cv2.CAP_PROP_FPS)
    ov_total_frames = int(overlay_cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # If no frame rate was specified, use the background video's
    if frame_rate is None:
        frame_rate = bg_fps

    # Initialize the VideoWriter
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    output_video = cv2.VideoWriter(output_video_path, fourcc, frame_rate, (bg_width, bg_height))

    # Total number of frames to process
    total_frames = min(bg_total_frames, ov_total_frames)

    # Preload the overlay frames, resized, into GPU memory
    overlay_frames_gpu = []
    print("Loading and preprocessing overlay frames...")
    for i in tqdm(range(ov_total_frames), desc="Loading overlay video", unit="frame"):
        ret_overlay, frame_overlay = overlay_cap.read()
        if not ret_overlay:
            break
        frame_overlay_resized = cv2.resize(frame_overlay, (width, height))
        # Upload the frame to the GPU
        frame_overlay_gpu = cp.asarray(frame_overlay_resized, dtype=cp.float32)
        overlay_frames_gpu.append(frame_overlay_gpu)

    # Release the overlay capture; its frames now live in GPU memory
    overlay_cap.release()

    print("Processing video frames...")
    with tqdm(total=total_frames, desc="Processing video frames", unit="frame") as pbar:
        for i in range(total_frames):
            ret_bg, frame_bg = background_cap.read()
            if not ret_bg:
                break  # End of background video

            # Upload the background frame to the GPU
            frame_bg_gpu = cp.asarray(frame_bg, dtype=cp.float32)

            # Pick the matching overlay frame
            overlay_idx = int(i * ov_fps / bg_fps)
            if overlay_idx >= len(overlay_frames_gpu):
                overlay_idx = overlay_idx % len(overlay_frames_gpu)  # Loop a shorter overlay video

            frame_overlay_gpu = overlay_frames_gpu[overlay_idx]

            # Check that the overlay fits inside the background frame
            if y + height > bg_height or x + width > bg_width:
                print("Overlay frame exceeds background frame dimensions.")
                break

            # ROI of the background frame
            roi_gpu = frame_bg_gpu[y:y+height, x:x+width]

            # Weighted blend on the GPU (CuPy has no addWeighted, so express it directly)
            blended_gpu = frame_overlay_gpu * alpha + roi_gpu * (1 - alpha)

            # Copy the result back into the ROI
            frame_bg_gpu[y:y+height, x:x+width] = blended_gpu

            # Download the processed frame back to CPU memory
            frame_result = cp.asnumpy(frame_bg_gpu).astype(np.uint8)

            # Write to the output video
            output_video.write(frame_result)
            pbar.update(1)

    # Release resources
    background_cap.release()
    output_video.release()
    print("Video processing complete.")

if __name__ == "__main__":
    # Input video paths
    background_video_path = '/mnt/lgs/Open-Sora/moviepys/input/291514.mp4'  # Background video
    overlay_video_path = '/mnt/lgs/Open-Sora/moviepys/input/17095_1727158011_raw.mp4'  # Overlay video
    output_video_path = '/mnt/lgs/Open-Sora/gstreamers/output/output_video.mp4'  # Output video

    # Transparency and position settings
    alpha = 0.5  # Transparency (0.0 - 1.0)
    x, y = 100, 100  # Overlay position
    width, height = 640, 360  # Overlay size

    process_video(background_video_path, overlay_video_path, output_video_path, x, y, width, height, alpha)

Code Walkthrough

  1. CuPy for GPU computation

    • All arrays involved in heavy computation (the video frames) are converted from NumPy arrays to CuPy arrays, so those operations execute on the GPU.
    • cp.asarray() converts a NumPy array into a CuPy array.
    • The weighted blend runs on the GPU as a plain elementwise expression, frame_overlay_gpu * alpha + roi_gpu * (1 - alpha), since CuPy has no addWeighted equivalent (this round trip is isolated in the sketch after this list).
  2. Preloading overlay frames

    • To avoid re-reading the overlay video during processing, all overlay frames are loaded into GPU memory up front.
    • If the overlay video is large, this can exhaust GPU memory; adjust to your situation, or load frames one at a time during processing.
  3. Looping the overlay video

    • If the overlay video is shorter than the background, it is looped by taking the frame index modulo the overlay length.
  4. CUDA and CuPy version compatibility

    • Make sure your CuPy build matches your CUDA version; for example, with CUDA 11.x install cupy-cuda11x.
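The GPU round trip from point 1 can be isolated in a few lines. This sketch uses random stand-in frames and assumes a working CuPy installation:

python
import cupy as cp
import numpy as np

frame_bg = np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8)
overlay = np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8)
alpha = 0.5

bg_gpu = cp.asarray(frame_bg, dtype=cp.float32)      # host -> device
ov_gpu = cp.asarray(overlay, dtype=cp.float32)
blended_gpu = ov_gpu * alpha + bg_gpu * (1 - alpha)  # elementwise on the GPU
result = cp.asnumpy(blended_gpu).astype(np.uint8)    # device -> host
print(result.shape, result.dtype)                    # (360, 640, 3) uint8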

Before You Run

  • Install a CUDA-enabled OpenCV

    • OpenCV does not ship with CUDA acceleration by default; you need to compile it from source with CUDA support enabled.
    • See the official OpenCV documentation for the build steps: OpenCV CUDA Installation
  • Install CuPy

    • Install the CuPy build that matches your CUDA version. For example, for CUDA 11.1:

      bash
      pip install cupy-cuda111
  • Check the CUDA environment

    • Make sure the CUDA driver and CUDA Toolkit are installed correctly and that your GPU supports CUDA.

Possible Further Performance Improvements

  1. Avoid preloading all overlay frames

    • If GPU memory runs short, read the overlay video frame by frame during processing and upload each frame to the GPU as needed.
  2. Process frames in batches

    • Where possible, process several frames at once to improve efficiency further.
  3. Use a more efficient video codec

    • When initializing the VideoWriter, try a more efficient encoder such as H.264 ('H264'), provided your OpenCV build supports it; see the fallback sketch after this list.
  4. Tune thread counts

    • Depending on your GPU, tune CUDA kernel thread and block counts for the best performance.
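For point 3, whether a given fourcc works depends on the local OpenCV/FFmpeg build, so a safe pattern is to try H.264 first and fall back to mp4v. This is a sketch; the open_writer name is illustrative, and the path, frame rate, and frame size come from the script above:

python
import cv2

def open_writer(path, fps, size):
    """Try H.264 fourccs first; fall back to mp4v if the build lacks them."""
    for tag in ('avc1', 'H264', 'mp4v'):
        fourcc = cv2.VideoWriter_fourcc(*tag)
        writer = cv2.VideoWriter(path, fourcc, fps, size)
        if writer.isOpened():
            return writer, tag
    raise RuntimeError(f"No usable codec found for {path}")

# Usage: output_video, tag = open_writer(output_video_path, frame_rate, (bg_width, bg_height))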

Caveats

  • GPU memory limits

    • If the overlay video is high resolution and long, preloading every frame may exhaust GPU memory. In that case, read and process the overlay video frame by frame; a rough cost estimate is sketched below.
  • Environment complexity

    • GPU acceleration needs extra environment setup; make sure all library versions are compatible and the CUDA environment is configured correctly.
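To gauge the first caveat, a back-of-the-envelope estimate: each preloaded overlay frame is stored as float32, so at the 640×360 overlay size used here, a clip as long as the 1366-frame run in the log above would need roughly 3.5 GiB of GPU memory. The actual overlay frame count is what matters; 1366 is used purely for illustration.

python
# Rough GPU memory cost of preloading overlay frames as float32
width, height, channels = 640, 360, 3
bytes_per_frame = width * height * channels * 4    # float32 = 4 bytes per value
n_frames = 1366                                    # illustrative frame count
print(bytes_per_frame * n_frames / 2**30, "GiB")   # ~3.5 GiB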

Conclusion

By moving the array operations onto the GPU, in particular with the CuPy library, you can significantly increase video processing speed. Make sure your environment is set up as described above, and adjust the parameters to your hardware.

Hope this helps speed up your video processing!
