Tenho o arquivo legenda.psv, que segue essa lógica...

Question

Tenho o arquivo legenda.psv, que segue essa lógica de formato:

...
Q|12.818|12.919
u|12.919|12.960
a|12.960|12.980
n|12.980|13.041
d|13.041|13.102
o|13.102|13.183
|13.183|13.426
e|13.426|13.446
u|13.446|13.649
|13.649|13.669
o|13.669|13.953
l|13.953|13.974
h|13.974|14.014
o|14.014|14.034
|14.034|14.197
p|14.197|14.217
a|14.217|14.420
r|14.420|14.460
a|14.460|14.480
|14.480|14.663
a|14.663|14.683
|14.683|14.703
n|14.703|14.724
a|14.724|14.926
t|14.926|14.947
u|14.947|14.967
r|14.967|14.987
e|14.987|15.007
z|15.007|15.028
a.|15.028|15.068
...

De uma letra maiúscula até a próxima letra seguida de um "." final, o trecho é representado como uma frase.
Cada letra tem seus respectivos timestamps de start e de end.
Para separar uma palavra da outra, existe uma linha que não tem letra, em vez disso tem um espaço (também com o seu respectivo timestamp start e end).

Com base nessa explicação, crie um script Python que cria um vídeo karaokê com base na legenda.psv, onde haverá um gradiente que vai passando sobre X frases no vídeo; deverá ser extremamente fluido e preciso.

Deve usar:

import cupy as cp
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import time
import threading
import math
import os
import queue
import subprocess
from tqdm import tqdm
import psutil
import traceback
from cupyx.scipy import ndimage as cupy_ndimage

-- Mande o código completo, e pronto para uso sem erros.

Answer 1

python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Karaokê GPU Gradient Renderer
=============================

Gera um vídeo‐karaokê extremamente fluido e preciso (60 fps, 1280 × 720 px)
a partir de um arquivo ``legenda.psv`` no formato:

    letra|timestamp_início|timestamp_fim

onde « » (espaço) é o separador de palavras.  
Cada frase começa na primeira letra maiúscula e termina na primeira
letra que contenha «.».

Requisitos
----------
• Python ≥ 3.9 + FFmpeg instalado no *PATH*  
• pip install cupy-cuda11x pillow tqdm psutil

Uso
---
Coloque «legenda.psv» e a fonte «Roboto-Bold.ttf» na mesma pasta e rode:

    python karaoke_gpu.py

(Opcionalmente edite as variáveis de configuração logo abaixo.)
"""

##############################################################################
# Imports                                                                    #
##############################################################################

import cupy as cp
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import time
import threading
import math
import os
import queue
import subprocess
from tqdm import tqdm
import psutil
import traceback
from cupyx.scipy import ndimage as cupy_ndimage

##############################################################################
# Configurações                                                               #
##############################################################################

LEGENDA_PATH      = "legenda.psv"     # input subtitle file (char|start|end per line)
OUTPUT_VIDEO      = "karaoke.mp4"     # final rendered video
FPS               = 60                # frames per second
RESOLUTION        = (1280, 720)       # (width, height) in pixels
FONT_PATH         = "Roboto-Bold.ttf" # path to the TrueType font
FONT_SIZE         = 60                # font size in px
PHRASES_ON_SCREEN = 3                 # number of phrase lines shown at once
# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to a fixed value instead of raising TypeError on `None - 1`.
THREADS           = max(1, (os.cpu_count() or 4) - 1)
GRAD_BLUR_SIGMA   = 6.0               # Gaussian sigma used to smooth the gradient

##############################################################################
# Utilidades                                                                  #
##############################################################################

def parse_legenda(path: str):
    """Parse a ``char|start|end`` subtitle file into a list of phrases.

    Every data line carries one displayed character (or a blank/space field
    acting as a word separator) followed by its start and end timestamps.
    A phrase ends at the first entry whose text ends with ``"."``; entries
    left over at end of file are emitted as a final phrase.

    Lines that do not have three ``|``-separated fields, or whose timestamps
    are not numbers, are skipped.

    Args:
        path: Path to the UTF-8 encoded ``.psv`` file.

    Returns:
        A list of dicts with the keys ``"text"`` (the joined characters),
        ``"start"`` / ``"end"`` (timestamps of the first / last entry) and
        ``"chars"`` (list of ``{"char", "start", "end"}`` dicts).
    """
    phrases, current_chars = [], []

    def close_phrase():
        # Trailing separators carry no visible text; drop them so they do not
        # skew the phrase's end time.
        while current_chars and not current_chars[-1]["char"].strip():
            current_chars.pop()
        if current_chars:
            phrases.append(
                {"text": "".join(c["char"] for c in current_chars),
                 "start": current_chars[0]["start"],
                 "end":   current_chars[-1]["end"],
                 "chars": list(current_chars)}
            )
        current_chars.clear()

    with open(path, encoding="utf-8") as f:
        for raw in f:
            # Strip only the line terminator: a leading space IS the
            # word-separator "character" and must survive.
            raw = raw.rstrip("\r\n")
            # Split from the right so the text field may itself contain "|".
            parts = raw.rsplit("|", 2)
            if len(parts) != 3:
                continue
            char, start, end = parts
            try:
                start, end = float(start), float(end)
            except ValueError:
                continue

            if not char.strip():
                if not current_chars:
                    # Separator between two phrases: not part of either.
                    continue
                # Normalise an empty/blank field to a single space.
                char = " "

            current_chars.append({"char": char, "start": start, "end": end})
            if char.endswith("."):                       # end of phrase
                close_phrase()

    close_phrase()                                       # phrase without final "."
    return phrases


def load_font():
    """Load the font at ``FONT_PATH`` in size ``FONT_SIZE``.

    If the font file cannot be opened, a warning is printed and Pillow's
    built-in font is returned instead.  The sized variant of the built-in
    font is tried first so the fallback still respects ``FONT_SIZE``.
    """
    try:
        return ImageFont.truetype(FONT_PATH, FONT_SIZE)
    except OSError:
        print(f"[WARN] Fonte «{FONT_PATH}» não encontrada; usando fallback.")
        try:
            # Newer Pillow releases accept a size for the bundled default font.
            return ImageFont.load_default(size=FONT_SIZE)
        except (TypeError, OSError, ImportError):
            # Older Pillow (no `size` argument) or no FreeType support:
            # fall back to the fixed-size bitmap font.
            return ImageFont.load_default()


def measure_phrases(phrases, font):
    """Pre-compute the horizontal pixel offsets of every character boundary.

    For each phrase dict this stores, in place:

    * ``"cum_widths"`` - list of ``len(chars) + 1`` offsets; entry ``i`` is
      the measured length of the text formed by the first ``i`` entries.
    * ``"width"`` - the full text length rounded up to an ``int`` so it can
      be used directly as an image size or paste coordinate.

    Args:
        phrases: Phrase dicts as produced by ``parse_legenda``.
        font: Pillow font used to measure the text.
    """
    dummy_img = Image.new("RGB", (8, 8))
    draw = ImageDraw.Draw(dummy_img)

    for ph in phrases:
        cumul = [0]
        prefix = ""
        for c in ph["chars"]:
            # Measure the growing prefix rather than summing isolated glyph
            # widths, so the offsets follow the text as it is laid out in
            # one piece (kerning included).
            prefix += c["char"]
            cumul.append(draw.textlength(prefix, font=font))
        ph["cum_widths"] = cumul                     # len == len(chars) + 1
        ph["width"] = int(math.ceil(cumul[-1]))      # px, integer


def make_gradient(width: int, height: int) -> np.ndarray:
    """Build a horizontal red -> yellow -> red gradient as an RGB image array.

    Red is constant at 255, green follows ``sin(pi * u) * 255`` for ``u``
    running from 0 to 1 across the width, and blue is 0.  When
    ``GRAD_BLUR_SIGMA`` is non-zero the gradient is smoothed along the
    width only.

    Args:
        width: Gradient width in pixels.
        height: Gradient height in pixels.

    Returns:
        A ``(height, width, 3)`` ``uint8`` NumPy array (all zeros / empty when
        either dimension is not positive).
    """
    if width <= 0 or height <= 0:
        return np.zeros((max(height, 0), max(width, 0), 3), dtype=np.uint8)

    u = cp.linspace(0, 1, width, dtype=cp.float32)
    row = cp.stack(
        (cp.full_like(u, 255), cp.sin(u * math.pi) * 255, cp.zeros_like(u)),
        axis=-1,
    )                                                   # (width, 3)

    if GRAD_BLUR_SIGMA:
        # Blur along the width axis only.  A scalar sigma applied to the full
        # (h, w, 3) array would also blur across the channel axis and mix the
        # R/G/B values together.  All rows are identical, so filtering one
        # row and repeating it gives the same image.
        row = cupy_ndimage.gaussian_filter1d(row, GRAD_BLUR_SIGMA, axis=0)

    row = cp.asnumpy(cp.clip(row, 0, 255).astype(cp.uint8))
    # Repeat the single row on the host; np.tile returns a contiguous array.
    return np.tile(row[None, :, :], (height, 1, 1))


##############################################################################
# Gerador de Frames                                                           #
##############################################################################

class FrameWorker(threading.Thread):
    """Daemon thread that renders queued frames to ``frames/<index>.png``.

    Jobs are ``(frame_index, time_in_seconds)`` tuples read from the task
    queue; ``None`` is the shutdown sentinel.
    """

    def __init__(self, tasks_q: queue.Queue, phrases, font):
        """Store the job queue, the measured phrase list and the font."""
        super().__init__(daemon=True)
        self.q       = tasks_q
        self.phrases = phrases
        self.font    = font

    def run(self):
        """Render jobs until the ``None`` sentinel is received.

        Any exception prints its traceback and terminates the whole process
        with exit status 1.
        """
        try:
            while True:
                item = self.q.get()
                if item is None:               # poison pill
                    break
                idx, t_now = item
                self.render_frame(idx, t_now)
                self.q.task_done()
        except Exception:
            traceback.print_exc()
            # NOTE(review): presumably a hard exit so a dead worker cannot
            # leave the producer blocked on the bounded queue - confirm.
            os._exit(1)

    # ---------------------------------------------------------------------- #
    @staticmethod
    def _sung_width(ph, t_now):
        """Return how many pixels of phrase ``ph`` are already sung at ``t_now``.

        Characters that have ended count fully, the character being sung is
        interpolated linearly, and the result is clamped to
        ``[0, ph["width"]]`` and truncated to an ``int``.
        """
        cum = ph["cum_widths"]
        sung_w = 0
        for i, ch in enumerate(ph["chars"]):
            if t_now >= ch["end"]:
                sung_w = cum[i + 1]
                continue
            if ch["start"] <= t_now:
                # start <= t_now < end here, so the duration is non-zero.
                frac = (t_now - ch["start"]) / (ch["end"] - ch["start"])
                sung_w = cum[i] + (cum[i + 1] - cum[i]) * frac
            break
        return int(min(max(sung_w, 0), ph["width"]))

    # ---------------------------------------------------------------------- #
    def render_frame(self, idx: int, t_now: float):
        """Draw the frame for time ``t_now`` and save it as a PNG.

        The last ``PHRASES_ON_SCREEN`` phrases that have started are drawn in
        grey, centred horizontally; the already-sung part of each one is
        overpainted with the colour gradient through a text-shaped mask.
        """
        img       = Image.new("RGB", RESOLUTION, (0, 0, 0))
        draw      = ImageDraw.Draw(img)
        line_h    = FONT_SIZE + 12
        base_y    = RESOLUTION[1] - (PHRASES_ON_SCREEN * line_h) - 30

        visible = [p for p in self.phrases if p["start"] <= t_now]
        visible = visible[-PHRASES_ON_SCREEN:]

        for ln, ph in enumerate(visible):
            y = base_y + ln * line_h

            # Image sizes and paste offsets must be integers; the measured
            # width may be a float.
            width = int(math.ceil(ph["width"]))
            x = (RESOLUTION[0] - width) // 2

            # not-yet-sung text (grey)
            draw.text((x, y), ph["text"], font=self.font, fill=(90, 90, 90))

            sung_w = self._sung_width(ph, t_now)
            if sung_w == 0:
                continue

            # Text-shaped mask.  It is one full line tall so descenders, which
            # extend below FONT_SIZE, are highlighted as well.
            mask = Image.new("L", (width, line_h), 0)
            ImageDraw.Draw(mask).text((0, 0), ph["text"], font=self.font, fill=255)

            # gradient for the sung part, pasted through the cropped mask
            grad = Image.fromarray(make_gradient(sung_w, line_h))
            img.paste(grad, (x, y), mask.crop((0, 0, sung_w, line_h)))

        # optional: report RAM usage every 3000 frames
        if idx % 3000 == 0:
            mem = psutil.Process(os.getpid()).memory_info().rss / 2 ** 20
            print(f"[DEBUG] frame {idx} – RAM {mem:.1f} MiB")

        img.save(f"frames/{idx:08d}.png")


##############################################################################
# Pipeline principal                                                          #
##############################################################################

def main():
    """Render every frame to ``frames/`` and encode them with FFmpeg.

    Steps: parse and measure the phrases, fan the frame jobs out to
    ``THREADS`` worker threads through a bounded queue, wait for them to
    finish, then call ``ffmpeg`` to build ``OUTPUT_VIDEO``.

    Raises:
        SystemExit: If the subtitle file contains no phrase.
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    # -------------------------------------------------- Preparation
    phrases = parse_legenda(LEGENDA_PATH)
    if not phrases:
        raise SystemExit(f"Nenhuma frase encontrada em {LEGENDA_PATH!r}.")
    font = load_font()
    measure_phrases(phrases, font)

    total_dur    = max(p["end"] for p in phrases)
    total_frames = int(total_dur * FPS) + 1

    os.makedirs("frames", exist_ok=True)

    # -------------------------------------------------- Queue & threads
    task_q  = queue.Queue(maxsize=THREADS * 4)
    workers = [FrameWorker(task_q, phrases, font) for _ in range(THREADS)]
    for w in workers:
        w.start()

    # -------------------------------------------------- Enqueue work
    for f in tqdm(range(total_frames), desc="Gerando frames"):
        task_q.put((f, f / FPS))

    # wait for all frames, then send one poison pill per worker
    task_q.join()
    for _ in workers:
        task_q.put(None)
    for w in workers:
        w.join()

    # -------------------------------------------------- FFmpeg
    cmd = [
        "ffmpeg", "-y",
        "-framerate", str(FPS),
        "-i", "frames/%08d.png",
        # Stop after the frames of this run: PNGs left in frames/ by an
        # earlier, longer render would otherwise be appended to the video.
        "-frames:v", str(total_frames),
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        OUTPUT_VIDEO
    ]
    print("\n[FFmpeg] Renderizando vídeo final…")
    subprocess.run(cmd, check=True)
    print("✅ Karaokê gerado com sucesso ➜", OUTPUT_VIDEO)


if __name__ == "__main__":
    # perf_counter() is monotonic, so the reported duration is not affected
    # by system clock adjustments during a long render.
    tic = time.perf_counter()
    main()
    print(f"⏱️ Tempo total: {time.perf_counter() - tic:.1f}s")

Answer 2

python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GPU Karaokê Renderer v2
=======================

• Corrigido o problema das palavras “grudadas”: agora o parser preserva os
  espaços existentes no legenda.psv.  
• Tamanho da fonte aumentado para melhor leitura.  
• Passa a utilizar a fonte «Lilita One» indicada em
  C:/Users/lucas/LilitaOne-Regular.ttf.

Requisitos
----------
Python ≥ 3.9, FFmpeg no PATH e:

    pip install cupy-cuda11x pillow tqdm psutil

Coloque «legenda.psv» na mesma pasta do script e execute:

    python karaoke_gpu.py
"""

##############################################################################
# Imports                                                                    #
##############################################################################

import cupy as cp
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import time
import threading
import math
import os
import queue
import subprocess
from tqdm import tqdm
import psutil
import traceback
from cupyx.scipy import ndimage as cupy_ndimage

##############################################################################
# Configurações                                                               #
##############################################################################

# Input / output
LEGENDA_PATH = "legenda.psv"    # subtitle file (char|start|end per line)
OUTPUT_VIDEO = "karaoke.mp4"    # final rendered video

# Video geometry
FPS = 60                        # frames per second
RESOLUTION = (1280, 720)        # (width, height) in pixels

# Typography
FONT_PATH = r"C:/Users/lucas/LilitaOne-Regular.ttf"  # requested font
FONT_SIZE = 96                  # font size in px
PHRASES_ON_SCREEN = 3           # phrase lines shown at once

# Rendering
THREADS = max((os.cpu_count() or 4) - 1, 1)  # leave one core free, at least 1
GRAD_BLUR_SIGMA = 6.0           # Gaussian sigma used to smooth the gradient

##############################################################################
# Utilidades                                                                  #
##############################################################################

def parse_legenda(path: str):
    """Parse a ``char|start|end`` subtitle file into a list of phrases.

    Every data line carries one displayed character (or a blank/space field
    acting as a word separator) followed by its start and end timestamps.
    A phrase ends at the first entry whose text ends with ``"."``; entries
    left over at end of file are emitted as a final phrase.

    Lines that do not have three ``|``-separated fields, or whose timestamps
    are not numbers, are skipped.

    Args:
        path: Path to the UTF-8 encoded ``.psv`` file.

    Returns:
        A list of dicts with the keys ``"text"`` (the joined characters),
        ``"start"`` / ``"end"`` (timestamps of the first / last entry) and
        ``"chars"`` (list of ``{"char", "start", "end"}`` dicts).
    """
    phrases, cur = [], []

    def close_phrase():
        # Trailing separators carry no visible text; drop them so they do not
        # skew the phrase's end time.
        while cur and not cur[-1]["char"].strip():
            cur.pop()
        if cur:
            phrases.append({
                "text":  "".join(c["char"] for c in cur),
                "start": cur[0]["start"],
                "end":   cur[-1]["end"],
                "chars": list(cur)
            })
        cur.clear()

    with open(path, encoding="utf-8") as f:
        for raw in f:
            # Strip only the line terminator: a leading space IS the
            # word-separator "character" and must survive.
            raw = raw.rstrip("\r\n")
            # Split from the right so the text field may itself contain "|".
            parts = raw.rsplit("|", 2)
            if len(parts) != 3:
                continue                    # malformed line
            char, start, end = parts
            try:
                start, end = float(start), float(end)
            except ValueError:
                continue                    # malformed timestamps

            if not char.strip():
                if not cur:
                    # Separator between two phrases: not part of either, so
                    # the next phrase does not start with a blank.
                    continue
                # Normalise an empty/blank field to a single space.
                char = " "

            cur.append({"char": char, "start": start, "end": end})
            if char.endswith("."):          # end of phrase
                close_phrase()

    close_phrase()                          # phrase without final "."
    return phrases


def load_font():
    """Load the font at ``FONT_PATH`` in size ``FONT_SIZE``.

    If the font file cannot be opened, a warning is printed and Pillow's
    built-in font is returned instead.  The sized variant of the built-in
    font is tried first so the fallback still respects ``FONT_SIZE``.
    """
    try:
        return ImageFont.truetype(FONT_PATH, FONT_SIZE)
    except OSError:
        # The original message contained raw HTML entities (&laquo;/&raquo;).
        print(f"[WARN] Fonte «{FONT_PATH}» não encontrada; usando fallback.")
        try:
            # Newer Pillow releases accept a size for the bundled default font.
            return ImageFont.load_default(size=FONT_SIZE)
        except (TypeError, OSError, ImportError):
            # Older Pillow (no `size` argument) or no FreeType support:
            # fall back to the fixed-size bitmap font.
            return ImageFont.load_default()


def measure_phrases(phrases, font):
    """Pre-compute the horizontal pixel offsets of every character boundary.

    For each phrase dict this stores, in place:

    * ``"cum_widths"`` - list of ``len(chars) + 1`` offsets; entry ``i`` is
      the measured length of the text formed by the first ``i`` entries.
    * ``"width"`` - the full text length rounded up to an ``int`` so it can
      be used directly as an image size or paste coordinate.

    Args:
        phrases: Phrase dicts as produced by ``parse_legenda``.
        font: Pillow font used to measure the text.
    """
    dummy = Image.new("RGB", (8, 8))
    d     = ImageDraw.Draw(dummy)
    for ph in phrases:
        cumul = [0]
        prefix = ""
        for c in ph["chars"]:
            # Measure the growing prefix rather than summing isolated glyph
            # widths, so the offsets follow the text as it is laid out in
            # one piece (kerning included).
            prefix += c["char"]
            cumul.append(d.textlength(prefix, font=font))
        ph["cum_widths"] = cumul                     # len == len(chars) + 1
        ph["width"]      = int(math.ceil(cumul[-1])) # px, integer


def make_gradient(width: int, height: int) -> np.ndarray:
    """Build a horizontal red -> yellow -> red gradient as an RGB image array.

    Red is constant at 255, green follows ``sin(pi * u) * 255`` for ``u``
    running from 0 to 1 across the width, and blue is 0.  When
    ``GRAD_BLUR_SIGMA`` is non-zero the gradient is smoothed along the
    width only.

    Args:
        width: Gradient width in pixels.
        height: Gradient height in pixels.

    Returns:
        A ``(height, width, 3)`` ``uint8`` NumPy array (all zeros / empty when
        either dimension is not positive).
    """
    if width <= 0 or height <= 0:
        return np.zeros((max(height, 0), max(width, 0), 3), dtype=np.uint8)

    lin = cp.linspace(0, 1, width, dtype=cp.float32)
    row = cp.stack(
        (cp.full_like(lin, 255), cp.sin(lin * math.pi) * 255, cp.zeros_like(lin)),
        axis=-1,
    )                                                   # (width, 3)

    if GRAD_BLUR_SIGMA:
        # Blur along the width axis only.  A scalar sigma applied to the full
        # (h, w, 3) array would also blur across the channel axis and mix the
        # R/G/B values together.  All rows are identical, so filtering one
        # row and repeating it gives the same image.
        row = cupy_ndimage.gaussian_filter1d(row, GRAD_BLUR_SIGMA, axis=0)

    row = cp.asnumpy(cp.clip(row, 0, 255).astype(cp.uint8))
    # Repeat the single row on the host; np.tile returns a contiguous array.
    return np.tile(row[None, :, :], (height, 1, 1))


##############################################################################
# Gerador de Frames                                                           #
##############################################################################

class FrameWorker(threading.Thread):
    """Daemon thread that renders queued frames to ``frames/<index>.png``.

    Jobs are ``(frame_index, time_in_seconds)`` tuples read from the task
    queue; ``None`` is the shutdown sentinel.
    """

    def __init__(self, q: queue.Queue, phrases, font):
        """Store the job queue, the measured phrase list and the font."""
        super().__init__(daemon=True)
        self.q, self.phrases, self.font = q, phrases, font

    def run(self):
        """Render jobs until the ``None`` sentinel is received.

        Any exception prints its traceback and terminates the whole process
        with exit status 1.
        """
        try:
            while True:
                job = self.q.get()
                if job is None:
                    break
                idx, t_now = job
                self.render(idx, t_now)
                self.q.task_done()
        except Exception:
            traceback.print_exc()
            # NOTE(review): presumably a hard exit so a dead worker cannot
            # leave the producer blocked on the bounded queue - confirm.
            os._exit(1)

    # ------------------------------------------------------------------ #
    @staticmethod
    def _sung_width(ph, t_now):
        """Return how many pixels of phrase ``ph`` are already sung at ``t_now``.

        Characters that have ended count fully, the character being sung is
        interpolated linearly, and the result is clamped to
        ``[0, ph["width"]]`` and truncated to an ``int``.
        """
        cum = ph["cum_widths"]
        sung_w = 0
        for i, ch in enumerate(ph["chars"]):
            if t_now >= ch["end"]:
                sung_w = cum[i + 1]
                continue
            if ch["start"] <= t_now:
                # start <= t_now < end here, so the duration is non-zero.
                frac = (t_now - ch["start"]) / (ch["end"] - ch["start"])
                sung_w = cum[i] + (cum[i + 1] - cum[i]) * frac
            break
        return int(min(max(sung_w, 0), ph["width"]))

    # ------------------------------------------------------------------ #
    def render(self, idx: int, t_now: float):
        """Draw the frame for time ``t_now`` and save it as a PNG.

        The last ``PHRASES_ON_SCREEN`` phrases that have started are drawn in
        grey, centred horizontally; the already-sung part of each one is
        overpainted with the colour gradient through a text-shaped mask.
        """
        img    = Image.new("RGB", RESOLUTION, (0, 0, 0))
        draw   = ImageDraw.Draw(img)
        line_h = int(FONT_SIZE * 1.25)
        base_y = RESOLUTION[1] - (PHRASES_ON_SCREEN * line_h) - 40

        visible = [p for p in self.phrases if p["start"] <= t_now][-PHRASES_ON_SCREEN:]

        for ln, ph in enumerate(visible):
            y = base_y + ln * line_h

            # Image sizes and paste offsets must be integers; the measured
            # width may be a float.
            width = int(math.ceil(ph["width"]))
            x = (RESOLUTION[0] - width) // 2

            # not-yet-sung text (grey)
            draw.text((x, y), ph["text"], font=self.font, fill=(120, 120, 120))

            sung_w = self._sung_width(ph, t_now)
            if sung_w == 0:
                continue

            # Text-shaped mask.  It is one full line tall so descenders, which
            # extend below FONT_SIZE, are highlighted as well.
            mask = Image.new("L", (width, line_h), 0)
            ImageDraw.Draw(mask).text((0, 0), ph["text"], font=self.font, fill=255)

            # colour gradient pasted through the cropped mask
            grad = Image.fromarray(make_gradient(sung_w, line_h))
            img.paste(grad, (x, y), mask.crop((0, 0, sung_w, line_h)))

        # report RAM usage every 3000 frames
        if idx % 3000 == 0:
            mem = psutil.Process(os.getpid()).memory_info().rss / 2**20
            print(f"[DEBUG] frame {idx} – {mem:.0f} MiB")

        img.save(f"frames/{idx:08d}.png")


##############################################################################
# Pipeline principal                                                          #
##############################################################################

def main():
    """Render every frame to ``frames/`` and encode them with FFmpeg.

    Steps: parse and measure the phrases, fan the frame jobs out to
    ``THREADS`` worker threads through a bounded queue, wait for them to
    finish, then call ``ffmpeg`` to build ``OUTPUT_VIDEO``.

    Raises:
        SystemExit: If the subtitle file contains no phrase.
        subprocess.CalledProcessError: If FFmpeg exits with a non-zero status.
    """
    phrases = parse_legenda(LEGENDA_PATH)
    if not phrases:
        raise SystemExit(f"Nenhuma frase encontrada em {LEGENDA_PATH!r}.")
    font = load_font()
    measure_phrases(phrases, font)

    total_frames = int(max(p["end"] for p in phrases) * FPS) + 1
    os.makedirs("frames", exist_ok=True)

    task_q  = queue.Queue(maxsize=THREADS * 5)
    workers = [FrameWorker(task_q, phrases, font) for _ in range(THREADS)]
    for w in workers:
        w.start()

    for f in tqdm(range(total_frames), desc="Gerando frames"):
        task_q.put((f, f / FPS))

    # wait for all frames, then send one poison pill per worker
    task_q.join()
    for _ in workers:
        task_q.put(None)
    for w in workers:
        w.join()

    cmd = [
        "ffmpeg", "-y",
        "-framerate", str(FPS),
        "-i", "frames/%08d.png",
        # Stop after the frames of this run: PNGs left in frames/ by an
        # earlier, longer render would otherwise be appended to the video.
        "-frames:v", str(total_frames),
        "-c:v", "libx264", "-pix_fmt", "yuv420p",
        OUTPUT_VIDEO
    ]
    print("\n[FFmpeg] Renderizando vídeo final…")
    subprocess.run(cmd, check=True)
    # The original message contained a raw HTML entity (&rarr;).
    print(f"✅ Pronto → {OUTPUT_VIDEO}")


if __name__ == "__main__":
    # perf_counter() is monotonic, so the reported duration is not affected
    # by system clock adjustments during a long render.
    t0 = time.perf_counter()
    main()
    print(f"⏱️ Concluído em {time.perf_counter() - t0:.1f}s")

Tenho o arquivo legenda.psv, que segue essa lógica...

Pergunta

Python

Pensamento

Handling char spacing and gradient

Final script plan

Resposta

Pergunta

Pensamento

Resposta

Compartilhar esta Q&A