commit 30c86b97c78ea073da753b26ce14c3aa9ddc89bc
parent 130c5b6924ead47b5fd9865bcd19e4dd359d0bdf
Author: Pablo Murad <pblmrd@gmail.com>
Date: Sat, 9 May 2026 22:51:39 -0300
Mariela Boca Murcha
Diffstat:
12 files changed, 654 insertions(+), 125 deletions(-)
diff --git a/lazier/api/routes.py b/lazier/api/routes.py
@@ -17,7 +17,7 @@ from pydantic import BaseModel
from ..audio_processor import extract_audio_from_video
from ..core.formats import export
-from ..core.jobs import build_job_artifact_path, get_job_store, slugify_source_name
+from ..core.jobs import build_job_artifact_path, get_job_store, lazier_download_filename
from ..core.processing import process_source
from ..core.supported_sites import SUPPORTED_VIDEO_SITES
from ..core.playlist import is_playlist_url
@@ -52,22 +52,24 @@ def _resolve_mode(
summarize: Optional[bool],
) -> str:
if mode:
- if mode not in {"transcribe", "summarize"}:
- raise HTTPException(status_code=400, detail="Modo invalido. Use 'transcribe' ou 'summarize'.")
+ if mode not in {"transcribe", "summarize", "process"}:
+ raise HTTPException(
+ status_code=400,
+ detail="Modo invalido. Use 'process' (padrao), 'transcribe' ou 'summarize'.",
+ )
return mode
if transcribe is None and summarize is None:
- raise HTTPException(status_code=400, detail="Informe `mode`.")
+ return "process"
+ if transcribe and summarize:
+ return "process"
if transcribe and not summarize:
return "transcribe"
if summarize and not transcribe:
return "summarize"
- raise HTTPException(
- status_code=400,
- detail="A combinacao legada de transcribe/summarize nao aceita mais os dois modos ao mesmo tempo.",
- )
+ return "process"
def _job_title(job: dict) -> str:
@@ -86,10 +88,24 @@ def _job_title(job: dict) -> str:
def _progress_updater(job_id: str):
store = get_job_store()
- def callback(progress: int, status: str, message: Optional[str] = None) -> None:
+ def callback(
+ progress: int,
+ status: str,
+ message: Optional[str] = None,
+ *,
+ metadata_patch: Optional[dict] = None,
+ ) -> None:
updates = {"progress": progress, "status": status}
if status == "failed":
updates["error"] = message
+ if metadata_patch:
+ job_row = store.get_job(job_id)
+ if job_row:
+ base = dict(job_row.get("metadata") or {})
+ for key in ("title", "webpage_url"):
+ if key in metadata_patch and metadata_patch[key] is not None:
+ base[key] = metadata_patch[key]
+ updates["metadata"] = base
store.update_job(job_id, **updates)
broadcast_progress(job_id, progress, status, message)
@@ -383,6 +399,11 @@ async def get_job_status(job_id: str):
"mode": job["mode"],
"status": job["status"],
"progress": job.get("progress", 0),
+ "title": _job_title(job),
+ "url": job.get("source_url"),
+ "file_path": job.get("file_path"),
+ "format": job.get("format"),
+ "created_at": job.get("created_at"),
"result_path": job.get("result_path"),
"transcription_path": job.get("transcription_path"),
"summary_path": job.get("summary_path"),
@@ -489,8 +510,7 @@ async def download_bundle(job_id: str, background_tasks: BackgroundTasks):
tmp_path.unlink(missing_ok=True)
raise
- slug = slugify_source_name(job.get("source_name"))
- zip_filename = f"{slug}-{job_id[:8]}-tudo.zip"
+ zip_filename = lazier_download_filename("zip")
background_tasks.add_task(_unlink_quiet, str(tmp_path))
return FileResponse(str(tmp_path), media_type="application/zip", filename=zip_filename)
diff --git a/lazier/cli.py b/lazier/cli.py
@@ -31,7 +31,7 @@ console = Console()
def _progress_notifier(progress_bar, task_id):
- def callback(progress: int, _status: str, message: Optional[str] = None):
+ def callback(progress: int, _status: str, message: Optional[str] = None, *, metadata_patch=None):
if message:
progress_bar.update(task_id, description=f"[cyan]{message}")
progress_bar.update(task_id, completed=progress)
@@ -106,7 +106,11 @@ def _run_mode(
sys.exit(1)
console.print(f"\n[bold green]✓ Processamento concluido![/bold green]")
- console.print(f"[cyan]Arquivo gerado:[/cyan] {result['result_path']}")
+ console.print(f"[cyan]Arquivo principal:[/cyan] {result['result_path']}")
+ if result.get("transcription_path") and result["transcription_path"] != result.get("result_path"):
+ console.print(f"[cyan]Transcricao:[/cyan] {result['transcription_path']}")
+ if result.get("summary_path") and result["summary_path"] != result.get("result_path"):
+ console.print(f"[cyan]Sumario:[/cyan] {result['summary_path']}")
if result.get("content_type"):
console.print(f"[cyan]Tipo detectado:[/cyan] {result['content_type']}")
if result.get("chapters"):
@@ -125,9 +129,9 @@ def cli(ctx, input_path):
if input_path:
console.print(
Panel.fit(
- "Use comandos explicitos:\n\n"
- "`lazier transcribe <input>` para transcrever em portugues\n"
- "`lazier summarize <input>` para gerar um sumario em portugues",
+ "Fluxo unificado: transcrição em PT-BR + sumário (ajuste com LAZIER_ALWAYS_SUMMARY).\n\n"
+ "`lazier process <input>` — fluxo unificado explícito.\n"
+ "`lazier transcribe` / `lazier summarize` — modos legados (ver LAZIER_ALWAYS_SUMMARY).",
title="Modo de Uso",
)
)
@@ -182,7 +186,7 @@ def transcribe(
chapters_flag: Optional[bool],
reasoning: Optional[str],
):
- """Transcreve ou converte o conteudo para portugues."""
+ """Converte/transcreve para PT-BR; com LAZIER_ALWAYS_SUMMARY=true também gera sumário."""
_run_mode(
input_path=input_path,
mode="transcribe",
@@ -211,7 +215,7 @@ def summarize(
chapters_flag: Optional[bool],
reasoning: Optional[str],
):
- """Gera um sumario em portugues do conteudo informado."""
+ """Foco em sumário; com LAZIER_ALWAYS_SUMMARY=true também exporta transcrição completa."""
_run_mode(
input_path=input_path,
mode="summarize",
@@ -226,6 +230,35 @@ def summarize(
@cli.command()
+@click.argument("input_path", type=str)
+@click.option("--output", "-o", type=str, help="Nome do arquivo de saida")
+@click.option("--format", "-f", "format_type", type=click.Choice(["docx", "txt", "md", "json", "pdf"]), default="docx", help="Formato de saida")
+@_model_options
+def process(
+ input_path: str,
+ output: Optional[str],
+ format_type: str,
+ model: Optional[str],
+ gpt_model: Optional[str],
+ smart_flag: Optional[bool],
+ chapters_flag: Optional[bool],
+ reasoning: Optional[str],
+):
+ """Executa o pipeline unificado (transcrição + sumário em PT-BR)."""
+ # NOTE: click presents a command's docstring as its --help text, so the
+ # docstring above is user-facing and is left untranslated.
+ # Thin wrapper: every CLI option is forwarded to _run_mode with mode="process"
+ # (smart_flag -> smart, chapters_flag -> chapters, reasoning -> reasoning).
+ _run_mode(
+ input_path=input_path,
+ mode="process",
+ output=output,
+ format_type=format_type,
+ model=model,
+ gpt_model=gpt_model,
+ smart=smart_flag,
+ chapters=chapters_flag,
+ reasoning=reasoning,
+ )
+
+
+@cli.command()
def config():
"""Mostra a configuracao corrente de modelos."""
cfg = get_model_config(refresh=True)
@@ -240,6 +273,8 @@ def config():
f"Capitulos: [cyan]{'on' if cfg.enable_chapters else 'off'}[/cyan]\n"
f"Preset qualidade: [cyan]{cfg.quality_preset}[/cyan]\n"
f"Sumario hierarquico: [cyan]{'on' if cfg.hierarchical_summary else 'off'}[/cyan]\n"
+ f"Sempre gerar sumario (LAZIER_ALWAYS_SUMMARY): [cyan]{'on' if cfg.always_summary else 'off'}[/cyan]\n"
+ f"Diarizacao (LAZIER_DIARIZATION_PROVIDER): [cyan]{cfg.diarization_provider or 'none'}[/cyan]\n"
f"STT alternativo (flag): [cyan]{'on' if cfg.alt_stt_enabled else 'off'}[/cyan]",
title="Lazier - Config Atual",
)
diff --git a/lazier/core/config.py b/lazier/core/config.py
@@ -107,6 +107,8 @@ class ModelConfig:
summary_map_chunk_chars: int = 14_000
summary_chunk_overlap_chars: int = 1_400
alt_stt_enabled: bool = False
+ always_summary: bool = True
+ diarization_provider: str = "none"
def supports_reasoning(self, model: Optional[str] = None) -> bool:
"""Indica se devemos enviar `reasoning_effort` para um modelo."""
@@ -153,6 +155,8 @@ def get_model_config(refresh: bool = False) -> ModelConfig:
summary_map_chunk_chars=_env_int("LAZIER_SUMMARY_MAP_CHUNK_CHARS", 14_000, min_value=2_000),
summary_chunk_overlap_chars=_env_int("LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS", 1_400, min_value=0),
alt_stt_enabled=_env_bool("LAZIER_ALT_STT_ENABLED", False),
+ always_summary=_env_bool("LAZIER_ALWAYS_SUMMARY", True),
+ diarization_provider=_env_str("LAZIER_DIARIZATION_PROVIDER", "none").lower(),
)
return _cached_config
diff --git a/lazier/core/jobs.py b/lazier/core/jobs.py
@@ -5,6 +5,7 @@ Persistencia de jobs e organizacao de arquivos de saida.
import json
import os
import re
+import secrets
import sqlite3
import threading
import unicodedata
@@ -14,11 +15,14 @@ from typing import Any, Dict, List, Optional, Union
TERMINAL_STATUSES = {"completed", "failed", "interrupted"}
-ARTIFACT_FILENAMES = {
- "transcription": "transcricao",
- "summary": "sumario",
- "result": "resultado",
-}
+ARTIFACT_KINDS = frozenset({"transcription", "summary", "result"})
+
+
+def lazier_download_filename(format_type: str) -> str:
+ """Return a file name of the form ``lazier_<6 random digits>.<ext>``.
+
+ The numeric part is drawn anew on every call (range 100000-999999), so the
+ name contains no spaces or non-ASCII characters but is NOT stable across
+ calls. Leading dots in *format_type* are stripped before it is used as the
+ extension.
+ """
+ # NOTE(review): two calls can return the same name; callers that write more
+ # than one file into the same directory should confirm a collision is acceptable.
+ n = secrets.randbelow(900_000) + 100_000
+ ext = format_type.lstrip(".")
+ return f"lazier_{n}.{ext}"
def _coerce_datetime(value: Optional[Union[str, datetime]]) -> datetime:
@@ -72,7 +76,7 @@ def build_job_artifact_path(
created_at: Optional[Union[str, datetime]] = None,
output_root: Optional[Path] = None,
) -> Path:
- if artifact_kind not in ARTIFACT_FILENAMES:
+ if artifact_kind not in ARTIFACT_KINDS:
raise ValueError(f"Artefato nao suportado: {artifact_kind}")
output_dir = build_job_output_dir(
@@ -82,8 +86,7 @@ def build_job_artifact_path(
output_root=output_root,
)
output_dir.mkdir(parents=True, exist_ok=True)
- filename = ARTIFACT_FILENAMES[artifact_kind]
- return output_dir / f"{filename}.{format_type}"
+ return output_dir / lazier_download_filename(format_type)
class JobStore:
diff --git a/lazier/core/processing.py b/lazier/core/processing.py
@@ -5,11 +5,12 @@ Inclui as etapas de:
- Validacao de input
- Download (YouTube/web) ou preparacao de audio
- Transcricao (com ou sem timestamps)
-- Conversao para PT-BR
+- Diarizacao opcional (LAZIER_DIARIZATION_PROVIDER) antes da conversao para PT-BR
+- Conversao para PT-BR + revisao ortografica leve (polish_pt_br_text)
- Deteccao de tipo de conteudo
-- Sumario (legado ou estruturado)
+- Sumario (legado ou estruturado; LAZIER_ALWAYS_SUMMARY=false recupera fluxos antigos)
- Capitulos com timestamps
-- Export final
+- Export final (dois artefactos quando transcrição + sumário e saida por job)
"""
from __future__ import annotations
@@ -29,8 +30,10 @@ from .exceptions import MusicContentError
from .jobs import build_job_artifact_path, get_outputs_root
from ..audio_processor import prepare_audio_file
from ..downloader import download_video_audio, download_youtube_audio
+from ..diarization import maybe_enrich_transcript_with_diarization
from ..summarizer import (
format_smart_summary_as_text,
+ polish_pt_br_text,
render_text_in_portuguese,
summarize_smart,
summarize_text,
@@ -40,16 +43,38 @@ from ..utils import cleanup_files, validate_input
from ..web.extractor import extract_pdf_content, extract_text_file_content, extract_web_content
from .formats import export
-ProgressCallback = Optional[Callable[[int, str, Optional[str]], None]]
+ProgressCallback = Optional[Callable[..., None]]
MEDIA_INPUT_TYPES = {"audio", "video", "youtube"}
_pipeline_log = logging.getLogger("lazier.pipeline")
-def _notify(callback: ProgressCallback, progress: int, status: str, message: Optional[str] = None) -> None:
+def _early_title_patch(metadata: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+ """Build a minimal metadata patch for showing the title early in the web GUI.
+
+ Returns ``None`` when ``metadata`` has no truthy ``"title"``. Otherwise
+ returns a dict with ``"title"`` coerced to ``str`` and, when the value is
+ truthy, ``"webpage_url"`` coerced to ``str``. The patch is intended to be
+ merged into the job's metadata.
+ """
+ title = metadata.get("title")
+ if not title:
+ return None
+ patch: Dict[str, Any] = {"title": str(title)}
+ wu = metadata.get("webpage_url")
+ if wu:
+ patch["webpage_url"] = str(wu)
+ return patch
+
+
+def _notify(
+ callback: ProgressCallback,
+ progress: int,
+ status: str,
+ message: Optional[str] = None,
+ *,
+ metadata_patch: Optional[Dict[str, Any]] = None,
+) -> None:
if callback:
- callback(progress, status, message)
+ if metadata_patch:
+ callback(progress, status, message, metadata_patch=metadata_patch)
+ else:
+ callback(progress, status, message)
def _get_cache_manager_safe():
@@ -68,11 +93,26 @@ def _ensure_api_key() -> None:
def _ensure_mode(mode: str) -> str:
- if mode not in {"transcribe", "summarize"}:
- raise ValueError("Modo invalido. Use 'transcribe' ou 'summarize'.")
+ if mode not in {"transcribe", "summarize", "process"}:
+ raise ValueError("Modo invalido. Use 'transcribe', 'summarize' ou 'process'.")
return mode
+def _maybe_polish_pt_br(text: Optional[str], kind: str, runtime: Dict[str, Any]) -> Optional[str]:
+    """Run the best-effort pt-BR spelling pass over *text* without ever losing it.
+
+    Args:
+        text: Content to review; ``None`` or blank text is returned untouched.
+        kind: Hint forwarded to ``polish_pt_br_text`` (e.g. "transcription" or "summary").
+        runtime: Resolved runtime options; ``"chat_model"`` and
+            ``"reasoning_effort"`` are read from it.
+
+    Returns:
+        The polished text, or the original *text* when the review fails or
+        comes back empty.
+    """
+    if not text or not str(text).strip():
+        return text
+    try:
+        polished = polish_pt_br_text(
+            text,
+            kind=kind,
+            model=runtime["chat_model"],
+            reasoning_effort=runtime["reasoning_effort"],
+        )
+    except Exception as exc:
+        # Deliberate best-effort boundary: a failed review must not abort the pipeline.
+        _pipeline_log.warning("polish_pt_br falhou (%s): %s", kind, exc)
+        return text
+    # An empty answer from the model must not replace real content; the caller
+    # would otherwise treat the transcript/summary as missing.
+    if not polished or not str(polished).strip():
+        _pipeline_log.warning("polish_pt_br devolveu texto vazio (%s); mantendo original", kind)
+        return text
+    return polished
+
+
def _pipeline_trace(job_id: Optional[str], step: str, **fields: Any) -> None:
if not job_id:
return
@@ -95,28 +135,42 @@ def _resolve_runtime(
"chapters_enabled": config.enable_chapters if use_chapters is None else use_chapters,
"reasoning_effort": config.reasoning_effort,
"quality_preset": config.quality_preset,
+ "always_summary": config.always_summary,
}
-def _export_selected_artifact(
+def _export_single_user_path(
mode: str,
format_type: str,
output_path: str,
transcription: str,
summary: Optional[str],
metadata: Dict[str, Any],
+ *,
+ prefer_summary: bool,
) -> str:
- if mode == "transcribe":
+ """Um único ficheiro (-o / CLI): documento completo quando há transcrição e sumário."""
+ has_tx = bool(transcription and transcription.strip())
+ has_sm = bool(summary and str(summary).strip())
+ if has_tx and has_sm:
return export(
transcription=transcription,
- summary=None,
+ summary=summary,
+ metadata=metadata,
+ output_path=output_path,
+ format_type=format_type,
+ )
+ if has_sm and (not has_tx or prefer_summary):
+ return export(
+ transcription="",
+ summary=summary,
metadata=metadata,
output_path=output_path,
format_type=format_type,
)
return export(
- transcription="",
- summary=summary,
+ transcription=transcription or "",
+ summary=None,
metadata=metadata,
output_path=output_path,
format_type=format_type,
@@ -166,6 +220,13 @@ def _transcribe_media(
_notify(progress_callback, progress_start, "processing", "Transcrevendo audio...")
raw_text = transcribe_audio(audio_file, language=None, model=runtime["transcribe_model"])
+ raw_text, segments = maybe_enrich_transcript_with_diarization(
+ audio_file,
+ raw_text,
+ segments,
+ metadata,
+ )
+
_notify(progress_callback, midpoint, "processing", "Convertendo conteudo para portugues...")
portuguese_text = render_text_in_portuguese(
raw_text,
@@ -376,6 +437,7 @@ def process_source(
reasoning_effort=runtime["reasoning_effort"],
smart_summary=runtime["smart_summary"],
chapters_enabled=runtime["chapters_enabled"],
+ always_summary=runtime["always_summary"],
)
if input_type == "youtube":
@@ -390,11 +452,24 @@ def process_source(
metadata["smart_summary"] = cached["smart_summary"]
if cached.get("chapters"):
metadata["chapters"] = cached["chapters"]
- _notify(progress_callback, 70, "processing", "Conteudo do YouTube encontrado no cache")
+ _notify(
+ progress_callback,
+ 70,
+ "processing",
+ "Conteudo do YouTube encontrado no cache",
+ metadata_patch=_early_title_patch(metadata),
+ )
else:
_notify(progress_callback, 15, "processing", "Baixando video do YouTube...")
audio_file, metadata = download_youtube_audio(source)
files_to_cleanup.append(audio_file)
+ _notify(
+ progress_callback,
+ 17,
+ "processing",
+ "Video obtido; a transcrever...",
+ metadata_patch=_early_title_patch(metadata),
+ )
portuguese_text, segments = _transcribe_media(
audio_file,
runtime=runtime,
@@ -409,6 +484,13 @@ def process_source(
_notify(progress_callback, 15, "processing", "Tentando extrair audio da URL...")
audio_file, metadata = download_video_audio(source)
files_to_cleanup.append(audio_file)
+ _notify(
+ progress_callback,
+ 16,
+ "processing",
+ "Audio da pagina obtido.",
+ metadata_patch=_early_title_patch(metadata),
+ )
cached = cache.get("video", url_hash) if cache else None
if cached and cached.get("transcription"):
metadata = cached.get("metadata", metadata) or metadata
@@ -419,7 +501,13 @@ def process_source(
metadata["smart_summary"] = cached["smart_summary"]
if cached.get("chapters"):
metadata["chapters"] = cached["chapters"]
- _notify(progress_callback, 70, "processing", "Conteudo de video encontrado no cache")
+ _notify(
+ progress_callback,
+ 70,
+ "processing",
+ "Conteudo de video encontrado no cache",
+ metadata_patch=_early_title_patch(metadata),
+ )
else:
portuguese_text, segments = _transcribe_media(
audio_file,
@@ -438,7 +526,13 @@ def process_source(
summary = cached.get("summary")
if cached.get("smart_summary"):
metadata["smart_summary"] = cached["smart_summary"]
- _notify(progress_callback, 70, "processing", "Conteudo web encontrado no cache")
+ _notify(
+ progress_callback,
+ 70,
+ "processing",
+ "Conteudo web encontrado no cache",
+ metadata_patch=_early_title_patch(metadata),
+ )
else:
_notify(progress_callback, 20, "processing", "Extraindo texto da pagina web...")
content_data = extract_web_content(source)
@@ -446,6 +540,13 @@ def process_source(
"title": content_data.get("title", "Pagina Web"),
"webpage_url": source,
}
+ _notify(
+ progress_callback,
+ 22,
+ "processing",
+ "Pagina obtida.",
+ metadata_patch=_early_title_patch(metadata),
+ )
portuguese_text = render_text_in_portuguese(
content_data["content"],
model=runtime["chat_model"],
@@ -476,6 +577,13 @@ def process_source(
"title": content_data.get("title", "Documento"),
"file_path": source,
}
+ _notify(
+ progress_callback,
+ 22,
+ "processing",
+ "Documento lido.",
+ metadata_patch=_early_title_patch(metadata),
+ )
portuguese_text = render_text_in_portuguese(
content_data["content"],
model=runtime["chat_model"],
@@ -490,6 +598,9 @@ def process_source(
# ---- Etapas pos-transcricao comuns a todos os tipos ----
if portuguese_text:
+ portuguese_text = _maybe_polish_pt_br(portuguese_text, "transcription", runtime) or ""
+
+ if portuguese_text:
_detect_and_attach_content_type(portuguese_text, metadata, runtime)
chapters = _build_chapters_if_possible(
@@ -499,7 +610,13 @@ def process_source(
progress_callback=progress_callback,
)
- if mode == "summarize" and not summary:
+ always_sum = runtime["always_summary"]
+ text_for_summary = (portuguese_text or "").strip()
+ should_build_summary = bool(text_for_summary) and (
+ always_sum or mode in {"summarize", "process"}
+ )
+
+ if should_build_summary and not summary:
summary = _build_summary(
portuguese_text or "",
metadata=metadata,
@@ -507,7 +624,7 @@ def process_source(
progress_callback=progress_callback,
trace_job_id=trace_id,
)
- elif mode == "summarize" and summary and runtime["smart_summary"] and not metadata.get("smart_summary"):
+ elif should_build_summary and summary and runtime["smart_summary"] and not metadata.get("smart_summary"):
# Cache antigo guardou apenas summary textual; reaproveitamos como tldr.
metadata["smart_summary"] = {
"tldr": summary[:600],
@@ -519,6 +636,9 @@ def process_source(
"open_questions": [],
}
+ if summary:
+ summary = _maybe_polish_pt_br(summary, "summary", runtime)
+
# ---- Atualiza caches "ricos" para fontes de midia ----
if cache and input_type == "youtube":
cache.set(
@@ -563,32 +683,118 @@ def process_source(
resolved_source_name = source_name or metadata.get("title") or source
+ has_tx = bool(portuguese_text and str(portuguese_text).strip())
+ has_sm = bool(summary and str(summary).strip())
+ dual_artifacts = (
+ not output_path
+ and has_tx
+ and has_sm
+ and (runtime["always_summary"] or mode == "process")
+ )
+
+ _notify(progress_callback, 92, "processing", f"Gerando arquivo {output_format.upper()}...")
+
+ transcription_path: Optional[str] = None
+ summary_path: Optional[str] = None
+ result_path: Optional[str] = None
+
if output_path:
final_output_path = Path(output_path)
if final_output_path.suffix != f".{output_format}":
final_output_path = final_output_path.with_suffix(f".{output_format}")
- else:
- artifact_kind = "transcription" if mode == "transcribe" else "summary"
- final_output_path = build_job_artifact_path(
+ exported_path = _export_single_user_path(
+ mode,
+ output_format,
+ str(final_output_path),
+ portuguese_text or "",
+ summary,
+ metadata,
+ prefer_summary=(mode == "summarize"),
+ )
+ result_path = exported_path
+ if has_tx and has_sm:
+ transcription_path = None
+ summary_path = None
+ elif has_sm:
+ transcription_path = None
+ summary_path = exported_path
+ else:
+ transcription_path = exported_path
+ summary_path = None
+ elif dual_artifacts:
+ tx_path = build_job_artifact_path(
job_id=run_id,
source_name=resolved_source_name,
format_type=output_format,
- artifact_kind=artifact_kind,
+ artifact_kind="transcription",
created_at=created_at,
output_root=output_root,
)
+ sm_path = build_job_artifact_path(
+ job_id=run_id,
+ source_name=resolved_source_name,
+ format_type=output_format,
+ artifact_kind="summary",
+ created_at=created_at,
+ output_root=output_root,
+ )
+ export(
+ transcription=portuguese_text or "",
+ summary=None,
+ metadata=metadata,
+ output_path=str(tx_path),
+ format_type=output_format,
+ )
+ export(
+ transcription="",
+ summary=summary,
+ metadata=metadata,
+ output_path=str(sm_path),
+ format_type=output_format,
+ )
+ transcription_path = str(tx_path)
+ summary_path = str(sm_path)
+ result_path = summary_path or transcription_path
+ elif has_sm:
+ sm_only = build_job_artifact_path(
+ job_id=run_id,
+ source_name=resolved_source_name,
+ format_type=output_format,
+ artifact_kind="summary",
+ created_at=created_at,
+ output_root=output_root,
+ )
+ exported_path = export(
+ transcription="",
+ summary=summary,
+ metadata=metadata,
+ output_path=str(sm_only),
+ format_type=output_format,
+ )
+ result_path = exported_path
+ summary_path = exported_path
+ elif has_tx:
+ tx_only = build_job_artifact_path(
+ job_id=run_id,
+ source_name=resolved_source_name,
+ format_type=output_format,
+ artifact_kind="transcription",
+ created_at=created_at,
+ output_root=output_root,
+ )
+ exported_path = export(
+ transcription=portuguese_text or "",
+ summary=None,
+ metadata=metadata,
+ output_path=str(tx_only),
+ format_type=output_format,
+ )
+ result_path = exported_path
+ transcription_path = exported_path
+ else:
+ raise Exception("Nenhum conteudo gerado para exportar.")
- _notify(progress_callback, 92, "processing", f"Gerando arquivo {output_format.upper()}...")
- exported_path = _export_selected_artifact(
- mode=mode,
- format_type=output_format,
- output_path=str(final_output_path),
- transcription=portuguese_text or "",
- summary=summary,
- metadata=metadata,
- )
-
- output_dir = str(Path(exported_path).parent)
+ output_dir = str(Path(result_path or ".").parent)
result = {
"mode": mode,
"input_type": input_type,
@@ -599,9 +805,9 @@ def process_source(
"smart_summary": metadata.get("smart_summary"),
"chapters": chapters,
"content_type": metadata.get("content_type"),
- "result_path": exported_path,
- "transcription_path": exported_path if mode == "transcribe" else None,
- "summary_path": exported_path if mode == "summarize" else None,
+ "result_path": result_path,
+ "transcription_path": transcription_path,
+ "summary_path": summary_path,
"output_dir": output_dir,
}
_pipeline_trace(
diff --git a/lazier/diarization.py b/lazier/diarization.py
@@ -0,0 +1,153 @@
+"""
+Diarização opcional de falantes após STT.
+
+Provedores futuros (AssemblyAI, pyannote, etc.) devem preencher intervalos
+``{start, end, speaker_id}`` em segundos; o texto é reorganizado em blocos
+``**Falante N**`` alinhados aos segmentos do Whisper quando possível.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+from .core.config import get_model_config
+
+_log = logging.getLogger(__name__)
+
+
+def _segment_midpoint_seconds(seg: Dict[str, Any]) -> float:
+ """Return the midpoint between the segment's ``"start"`` and ``"end"`` values.
+
+ A missing or falsy ``"start"`` counts as 0; a missing or falsy ``"end"``
+ falls back to the start value, so the midpoint is then the start itself.
+ """
+ start = float(seg.get("start", 0) or 0)
+ end = float(seg.get("end", start) or start)
+ return (start + end) / 2.0
+
+
+def _speaker_key(interval: Dict[str, Any]) -> str:
+ """Return the interval's raw speaker identifier as a string.
+
+ Reads ``"speaker_id"`` when that key is present, otherwise ``"speaker"``;
+ a ``None`` value yields ``"0"``.
+ """
+ # Note: a present "speaker_id" key whose value is None does not fall back to "speaker".
+ sid = interval.get("speaker_id", interval.get("speaker"))
+ if sid is None:
+ return "0"
+ return str(sid)
+
+
+def _build_speaker_rank(intervals: List[Dict[str, Any]]) -> Dict[str, int]:
+ """Map each raw provider speaker identifier to 1..N, in order of first occurrence."""
+ # `seen` keeps the distinct keys in the order they first appear in `intervals`.
+ seen: List[str] = []
+ for iv in intervals:
+ k = _speaker_key(iv)
+ if k not in seen:
+ seen.append(k)
+ return {k: i + 1 for i, k in enumerate(seen)}
+
+
+def _speaker_num_at_time(
+ t: float, intervals: List[Dict[str, Any]], rank: Dict[str, int]
+) -> int:
+ """Return the speaker number of the first interval whose [start, end] contains *t*.
+
+ Both bounds are inclusive. Intervals whose bounds cannot be converted to
+ float are skipped. Returns 1 when no interval contains *t* or when the
+ matching interval's key is absent from *rank*.
+ """
+ for iv in intervals:
+ try:
+ s = float(iv.get("start", 0))
+ e = float(iv.get("end", s))
+ except (TypeError, ValueError):
+ continue
+ if s <= t <= e:
+ k = _speaker_key(iv)
+ return rank.get(k, 1)
+ return 1
+
+
+def merge_transcript_with_speakers(
+    raw_text: str,
+    segments: List[Dict[str, Any]],
+    intervals: List[Dict[str, Any]],
+) -> str:
+    """Label timed segments with ``**Falante N**`` headings (consecutive blocks).
+
+    Each segment with non-blank text is attributed to the speaker returned by
+    ``_speaker_num_at_time`` for the segment's midpoint; consecutive segments
+    of the same speaker are joined with spaces under one heading, and blocks
+    are separated by blank lines.
+
+    Args:
+        raw_text: Plain transcript, returned as the fallback.
+        segments: Timed segments with ``"start"``, ``"end"`` and ``"text"``.
+        intervals: Diarization intervals carrying a speaker identifier.
+
+    Returns:
+        The labelled transcript, or *raw_text* unchanged when there are no
+        intervals, no segments, or no segment carries text.
+    """
+    # Plain text has no timing, so it cannot be aligned with the intervals.
+    # Returning it untouched avoids inventing one speaker per paragraph.
+    if not intervals or not segments:
+        return raw_text
+
+    rank = _build_speaker_rank(intervals)
+    parts: List[str] = []
+    buffer: List[str] = []
+    current_sp: Optional[int] = None
+
+    def flush() -> None:
+        """Append the buffered texts to ``parts`` as one block for ``current_sp``."""
+        if current_sp is None or not buffer:
+            return
+        text = " ".join(buffer).strip()
+        if text:
+            parts.append(f"**Falante {current_sp}**\n\n{text}")
+        # Cleared in place, so no rebinding (and no ``nonlocal``) is needed.
+        buffer.clear()
+
+    for seg in segments:
+        text = (seg.get("text") or "").strip()
+        if not text:
+            continue
+        sp = _speaker_num_at_time(_segment_midpoint_seconds(seg), intervals, rank)
+        # A speaker change closes the block that was being accumulated.
+        if current_sp is not None and sp != current_sp:
+            flush()
+        current_sp = sp
+        buffer.append(text)
+    flush()
+    return "\n\n".join(parts).strip() if parts else raw_text
+
+
+def diarize_audio_intervals(audio_path: str, provider: str) -> List[Dict[str, Any]]:
+ """
+ Run external speaker diarization. Returns an empty list when disabled or not implemented.
+
+ To integrate a real provider, implement it here and set LAZIER_DIARIZATION_PROVIDER.
+ """
+ # NOTE: every branch below currently returns []; audio_path is not read yet.
+ p = (provider or "").strip().lower()
+ if p in ("", "none", "off", "false"):
+ return []
+
+ # Recognized provider names that have no connector yet: warn and behave as disabled.
+ if p in ("assemblyai", "deepgram", "google", "pyannote"):
+ _log.warning(
+ "Diarização com provedor '%s' ainda não está integrada; "
+ "use LAZIER_DIARIZATION_PROVIDER=none ou contribua com o conector.",
+ p,
+ )
+ return []
+
+ _log.warning("LAZIER_DIARIZATION_PROVIDER='%s' desconhecido; ignorando.", provider)
+ return []
+
+
+def maybe_enrich_transcript_with_diarization(
+ audio_path: str,
+ raw_text: str,
+ segments: List[Dict[str, Any]],
+ metadata: Dict[str, Any],
+) -> Tuple[str, List[Dict[str, Any]]]:
+ """
+ Optionally rewrite the raw (pre PT-BR) text with **Falante N** markers.
+ ``segments`` are the Whisper segments (possibly empty).
+
+ Mutates ``metadata``: always sets "diarization_provider" and
+ "diarization_status" ("desligada", "indisponivel" or "ativa"), and sets
+ "diarization_segments" to the interval count when diarization is applied.
+ ``segments`` is returned unchanged in every case.
+ """
+ cfg = get_model_config()
+ provider = (cfg.diarization_provider or "none").strip().lower()
+ metadata["diarization_provider"] = provider
+
+ if provider in ("", "none", "off", "false"):
+ metadata["diarization_status"] = "desligada"
+ return raw_text, segments
+
+ intervals = diarize_audio_intervals(audio_path, provider)
+ # No intervals (provider unknown / not integrated): keep the transcript as is.
+ if not intervals:
+ metadata["diarization_status"] = "indisponivel"
+ return raw_text, segments
+
+ merged = merge_transcript_with_speakers(raw_text, segments, intervals)
+ metadata["diarization_status"] = "ativa"
+ metadata["diarization_segments"] = len(intervals)
+ return merged, segments
diff --git a/lazier/summarizer.py b/lazier/summarizer.py
@@ -238,7 +238,8 @@ def _render_portuguese_chunk(text: str, model: str, reasoning_effort: Optional[s
"- Se o texto ja estiver em portugues, mantenha em portugues do Brasil natural.\n"
"- Nao resuma, nao explique, nao comente o texto.\n"
"- Preserve nomes proprios, numeros, datas, links, listas e estrutura.\n"
- "- Mantenha o maximo de fidelidade possivel ao conteudo original.\n\n"
+ "- Mantenha o maximo de fidelidade possivel ao conteudo original.\n"
+ "- Use acentuacao e ortografia corretas em portugues do Brasil.\n\n"
"Texto:\n\n"
)
@@ -261,6 +262,86 @@ def _render_portuguese_chunk(text: str, model: str, reasoning_effort: Optional[s
raise _wrap_chat_error(exc, "converter texto para portugues") from exc
+POLISH_PT_BR_CHUNK_CHARS = 24_000
+
+
+def polish_pt_br_text(
+ text: str,
+ kind: str = "transcription",
+ model: Optional[str] = None,
+ reasoning_effort: Optional[str] = None,
+) -> str:
+ """
+ Light spelling review in pt-BR: accents, spelling, paragraphs.
+ The prompt asks the model not to change facts, proper names, numbers, links
+ or **Falante N** markers.
+
+ Blank input is returned as is ("" for None). Text longer than the chunk
+ limit is split with ``_split_text_into_chunks`` and reviewed chunk by chunk;
+ the non-empty results are joined with blank lines. ``model`` defaults to the
+ configured chat model.
+ """
+
+ if not text or not text.strip():
+ return text or ""
+
+ config = get_model_config()
+ chosen_model = model or config.chat_model
+ # The effective chunk size is the smaller of the polish limit and the default chunk limit.
+ max_chars = min(POLISH_PT_BR_CHUNK_CHARS, DEFAULT_CHUNK_CHAR_LIMIT)
+
+ # Any kind other than "summary" (case-insensitive) is treated as transcription.
+ kind_norm = (kind or "transcription").strip().lower()
+ if kind_norm == "summary":
+ kind_hint = (
+ "Este trecho e um sumario ou sintese: mantenha tom conciso e bullet/numeracao se ja existir."
+ )
+ else:
+ kind_hint = (
+ "Este trecho e transcricao ou texto corrido: preserve a ordem e falas; "
+ "nao resuma nem omita frases."
+ )
+
+ if len(text) <= max_chars:
+ return _polish_pt_br_chunk(text, chosen_model, kind_hint, reasoning_effort=reasoning_effort)
+
+ chunks = _split_text_into_chunks(text, max_chars)
+ out: List[str] = []
+ for i, chunk in enumerate(chunks):
+ print(f"Revisao ortografica pt-BR: parte {i + 1}/{len(chunks)}...")
+ out.append(_polish_pt_br_chunk(chunk, chosen_model, kind_hint, reasoning_effort=reasoning_effort))
+ return "\n\n".join(c.strip() for c in out if c.strip())
+
+
+def _polish_pt_br_chunk(
+    text: str,
+    model: str,
+    kind_hint: str,
+    reasoning_effort: Optional[str] = None,
+) -> str:
+    """Send one chunk to the chat model for a conservative pt-BR spelling review.
+
+    Args:
+        text: Chunk to review.
+        model: Chat model name.
+        kind_hint: Sentence inserted in the prompt describing the kind of text.
+        reasoning_effort: Forwarded to ``_chat_completions_kwargs``.
+
+    Returns:
+        The stripped model answer, or *text* unchanged when the answer is
+        empty, so a blank completion never discards the chunk.
+
+    Raises:
+        The error built by ``_wrap_chat_error`` when the request or the
+        reading of its response fails.
+    """
+    client = _ensure_client()
+    prompt = (
+        "Revise o texto em portugues do Brasil.\n\n"
+        f"{kind_hint}\n\n"
+        "Regras obrigatorias:\n"
+        "- Corrija acentuacao e ortografia em pt-BR.\n"
+        "- Melhore quebras de linha e paragrafos quando ajudar a leitura.\n"
+        "- Nao altere factos, dados, numeros, datas, nomes proprios, codigos, URLs.\n"
+        "- Preserve linhas que comecam com **Falante N** e a estrutura logo abaixo delas.\n"
+        "- Nao adicione comentarios meta; devolva apenas o texto revisado.\n\n"
+        "Texto:\n\n"
+    )
+    try:
+        kwargs = _chat_completions_kwargs(
+            model=model,
+            messages=[
+                {
+                    "role": "system",
+                    "content": "Voce e um revisor editorial em portugues do Brasil, preciso e conservador.",
+                },
+                {"role": "user", "content": prompt + text},
+            ],
+            temperature=0.05,
+            reasoning_effort=reasoning_effort,
+        )
+        response = client.chat.completions.create(**kwargs)
+        polished = (response.choices[0].message.content or "").strip()
+    except Exception as exc:
+        raise _wrap_chat_error(exc, "revisar ortografia em portugues") from exc
+    # The message content can be null or blank; keeping the input is safer than
+    # returning "" and silently dropping this part of the document.
+    return polished or text
+
+
# ---------------------------------------------------------------------------
# Sumario textual legado
# ---------------------------------------------------------------------------
diff --git a/lazier/web/templates/index.html b/lazier/web/templates/index.html
@@ -319,7 +319,7 @@
<header class="topbar">
<div class="brand">
<h1>Lazier</h1>
- <p>Transcrição ou sumário em português do Brasil.</p>
+ <p>Transcrição e sumário em português do Brasil em cada processamento.</p>
</div>
<nav class="nav">
<a href="#" class="active" onclick="showPage('process');return false;">Processar</a>
@@ -329,18 +329,12 @@
<section id="page-process" class="page active">
<div class="section-title">Novo processamento</div>
- <p class="section-lead">A saída final será em português do Brasil.</p>
+ <p class="section-lead">Gera transcrição completa e sumário em PT-BR; escolha no download o que quer guardar.</p>
+ <p class="subtle" style="margin-top:-8px;">Diarização de falantes é opcional (LAZIER_DIARIZATION_PROVIDER); sem ela, o texto não traz separação por falante.</p>
<div class="stack">
<div class="upload" id="uploadArea"><input type="file" id="fileInput" hidden multiple accept="audio/*,video/*,application/pdf,text/plain,text/markdown,text/html,.md,.htm,.txt"></div>
<div class="field"><label for="urlInput">URL (opcional)</label><input type="text" id="urlInput" placeholder="YouTube, página, áudio em linha…"></div>
- <div class="field">
- <label>Modo</label>
- <div class="mode-grid">
- <div class="mode selected" onclick="selectMode('transcribe',this)"><input type="radio" checked value="transcribe"><strong>Transcrever</strong><span class="meta">Texto completo em PT-BR.</span></div>
- <div class="mode" onclick="selectMode('summarize',this)"><input type="radio" value="summarize"><strong>Resumir</strong><span class="meta">Sumário em PT-BR.</span></div>
- </div>
- </div>
<div class="row-actions">
<div class="field">
<label for="formatSelect">Formato</label>
@@ -389,7 +383,7 @@
<script>
let selectedFiles = [];
- let processingMode = 'transcribe';
+ let processingMode = 'process';
let allJobs = [];
let currentFilter = 'all';
@@ -420,12 +414,6 @@
}
function removeFile(index) { selectedFiles.splice(index, 1); document.getElementById('fileInput').value = ''; renderUpload(); }
- function selectMode(mode, element) {
- processingMode = mode;
- document.querySelectorAll('.mode').forEach((node) => node.classList.remove('selected'));
- element.classList.add('selected');
- }
-
function showPage(page) {
document.querySelectorAll('.page').forEach((node) => node.classList.remove('active'));
document.querySelectorAll('.nav a').forEach((node) => node.classList.remove('active'));
@@ -474,7 +462,10 @@
function renderJob(job) {
const title = job.title || job.url || job.file_path || `Job ${job.id}`;
const statusLabel = { pending:'Pendente', processing:'A processar', completed:'Concluído', failed:'Falhou', interrupted:'Interrompido' }[job.status] || job.status;
- return `<div class="job-head"><div><div class="job-title">${escapeHtml(title)}</div><div class="chips">${job.mode ? `<span class="chip">${job.mode === 'transcribe' ? 'Transcrição' : 'Sumário'}</span>` : ''}${job.format ? `<span class="chip">${job.format.toUpperCase()}</span>` : ''}${job.created_at ? `<span class="chip">${new Date(job.created_at).toLocaleString('pt-BR')}</span>` : ''}</div></div><span class="status ${job.status}">${statusLabel}</span></div><div class="bar"><span style="width:${job.progress || 0}%"></span></div>${job.error ? `<div class="error">${escapeHtml(job.error)}</div>` : ''}${renderActions(job)}`;
+ const modeChip = job.mode === 'process'
+ ? 'Transcrição + sumário'
+ : (job.mode === 'transcribe' ? 'Transcrição' : (job.mode === 'summarize' ? 'Sumário' : '')); // any other (or missing) mode yields '' so no mode chip is rendered below
+ return `<div class="job-head"><div><div class="job-title">${escapeHtml(title)}</div><div class="chips">${modeChip ? `<span class="chip">${modeChip}</span>` : ''}${job.format ? `<span class="chip">${job.format.toUpperCase()}</span>` : ''}${job.created_at ? `<span class="chip">${new Date(job.created_at).toLocaleString('pt-BR')}</span>` : ''}</div></div><span class="status ${job.status}">${statusLabel}</span></div><div class="bar"><span style="width:${job.progress || 0}%"></span></div>${job.error ? `<div class="error">${escapeHtml(job.error)}</div>` : ''}${renderActions(job)}`;
}
function renderActions(job) {
@@ -486,35 +477,17 @@
const hrefDl = `/api/jobs/${id}/download`;
const hrefBundle = `/api/jobs/${id}/download-bundle`;
+ if (job.has_transcription) {
+ html += `<a class="action" href="${hrefTx}">Transcrição</a>`;
+ }
+ if (job.has_summary) {
+ html += `<a class="action" href="${hrefSum}">Sumário</a>`;
+ }
if (job.has_transcription && job.has_summary) {
- html += `<a class="action" href="${hrefBundle}">Tudo</a>`;
+ html += `<a class="action" href="${hrefBundle}">Pacote ZIP</a>`;
}
-
- if (job.mode === 'transcribe') {
- if (job.has_transcription || job.result_path) {
- html += `<a class="action" href="${hrefTx}">Transcrição</a>`;
- }
- if (job.has_summary) {
- html += `<a class="action" href="${hrefSum}">Sumário</a>`;
- }
- } else if (job.mode === 'summarize') {
- if (job.has_summary || job.result_path) {
- html += `<a class="action" href="${hrefSum}">Sumário</a>`;
- }
- if (job.has_transcription) {
- html += `<a class="action" href="${hrefTx}">Transcrição completa</a>`;
- }
- } else {
- if (job.has_summary && !job.has_transcription) {
- html += `<a class="action" href="${hrefSum}">Sumário</a>`;
- } else if (job.has_transcription && !job.has_summary) {
- html += `<a class="action" href="${hrefTx}">Transcrição</a>`;
- } else if (job.has_transcription && job.has_summary) {
- html += `<a class="action" href="${hrefTx}">Transcrição completa</a>`;
- html += `<a class="action" href="${hrefSum}">Sumário</a>`;
- } else if (job.result_path) {
- html += `<a class="action" href="${hrefDl}">Download</a>`;
- }
+ if (!job.has_transcription && !job.has_summary && job.result_path) {
+ html += `<a class="action" href="${hrefDl}">Download</a>`;
}
html += `<button type="button" class="action" onclick="viewJobDetails('${id}')">Visualizar</button></div>`;
@@ -533,7 +506,12 @@
function updateJob(jobId, data) {
const node = document.getElementById(`job-${jobId}`);
if (!node) return;
- node.innerHTML = renderJob({ id: jobId, title: node.dataset.title || undefined, ...data });
+ let title = node.dataset.title || ''; // start from the title remembered on the node
+ if (data.title) {
+ title = data.title;
+ node.dataset.title = data.title;
+ }
+ node.innerHTML = renderJob({ ...data, id: jobId, title: title || undefined }); // spread data first so a null/empty data.title cannot overwrite the resolved title
}
async function startPolling(jobId) {
@@ -600,14 +578,8 @@
if (!response.ok) throw new Error(data.detail || 'Erro ao carregar detalhes');
document.getElementById('previewTitle').textContent = data.metadata?.title || `Job ${jobId}`;
let content = '';
- const summarizeFirst = data.mode === 'summarize';
- if (summarizeFirst) {
- if (data.summary) { content += '<h3>Sumário</h3>' + renderPreview(data.summary, data.format) + '<hr>'; }
- if (data.transcription) { content += '<h3>Transcrição</h3>' + renderPreview(data.transcription, data.format); }
- } else {
- if (data.transcription) { content += '<h3>Transcrição</h3>' + renderPreview(data.transcription, data.format) + '<hr>'; }
- if (data.summary) { content += '<h3>Sumário</h3>' + renderPreview(data.summary, data.format); }
- }
+ if (data.summary) { content += '<h3>Sumário</h3>' + renderPreview(data.summary, data.format) + '<hr>'; }
+ if (data.transcription) { content += '<h3>Transcrição</h3>' + renderPreview(data.transcription, data.format); }
document.getElementById('previewContent').innerHTML = content || '<div class="empty">Sem conteúdo persistido.</div>';
document.getElementById('previewModal').classList.add('open');
} catch (error) {
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -34,14 +34,14 @@ class ApiTests(unittest.TestCase):
def tearDown(self):
shutil.rmtree(self.temp_dir, ignore_errors=True)
- def test_process_rejects_legacy_double_mode(self):
+ def test_process_unified_when_both_legacy_flags(self):  # legacy transcribe=True + summarize=True must now be accepted and reported as mode "process"
response = self.client.post(
"/api/process",
json={"url": "https://example.com/page", "format": "txt", "transcribe": True, "summarize": True},
)
- self.assertEqual(response.status_code, 400)
- self.assertIn("nao aceita mais os dois modos", response.json()["detail"])
+ self.assertEqual(response.status_code, 200)
+ self.assertEqual(response.json().get("mode"), "process")
def test_upload_persists_job_and_downloads_from_store(self):
output_dir = Path(os.environ["LAZIER_OUTPUT_DIR"]) / "2026" / "03" / "31" / "sample-job"
diff --git a/tests/test_diarization.py b/tests/test_diarization.py
@@ -0,0 +1,31 @@
+"""Testes da fusão texto + intervalos de falantes."""
+
+import unittest
+
+from lazier.diarization import merge_transcript_with_speakers
+
+
+class DiarizationMergeTests(unittest.TestCase):
+ def test_merge_groups_consecutive_segments_per_speaker(self):
+ intervals = [
+ {"start": 0.0, "end": 10.0, "speaker_id": "A"},
+ {"start": 10.0, "end": 20.0, "speaker_id": "B"},
+ ]
+ segments = [
+ {"start": 1.0, "end": 3.0, "text": "Ola"},
+ {"start": 4.0, "end": 6.0, "text": "mundo"},
+ {"start": 11.0, "end": 14.0, "text": "Segundo falante"},
+ ]
+ out = merge_transcript_with_speakers("", segments, intervals)
+ self.assertIn("**Falante 1**", out)
+ self.assertIn("**Falante 2**", out)
+ self.assertIn("Ola mundo", out)
+ self.assertIn("Segundo falante", out)
+
+ def test_merge_without_intervals_returns_raw(self):
+ raw = "Apenas texto."
+ self.assertEqual(merge_transcript_with_speakers(raw, [], []), raw)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_jobs.py b/tests/test_jobs.py
@@ -61,4 +61,4 @@ class JobStoreTests(unittest.TestCase):
self.assertIn("03", str(output_dir))
self.assertIn("31", str(output_dir))
self.assertTrue(output_dir.name.startswith("minha-reuniao-abc12345"))
- self.assertEqual(summary_path.name, "sumario.txt")
+ self.assertRegex(summary_path.name, r"^lazier_\d{6}\.txt$")
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -30,8 +30,15 @@ class ProcessingTests(unittest.TestCase):
audio_path.write_bytes(b"fake-audio")
with patch("lazier.core.processing.transcribe_audio", return_value="Hello world"), patch(
+ "lazier.core.processing.maybe_enrich_transcript_with_diarization",
+ lambda _p, raw, segs, _md: (raw, segs),
+ ), patch(
+ "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text
+ ), patch(
"lazier.core.processing.render_text_in_portuguese", return_value="Olá mundo"
), patch(
+ "lazier.core.processing.summarize_text", return_value="Resumo do áudio."
+ ), patch(
"lazier.core.processing.detect_content_type",
return_value={"content_type": "podcast", "confidence": 0.9, "rationale": ""},
):
@@ -47,15 +54,23 @@ class ProcessingTests(unittest.TestCase):
)
self.assertEqual(result["transcription"], "Olá mundo")
+ self.assertEqual(result["summary"], "Resumo do áudio.")
self.assertEqual(result["content_type"], "podcast")
- self.assertTrue(result["result_path"].endswith("transcricao.txt"))
- self.assertTrue(Path(result["result_path"]).exists())
+ self.assertIsNotNone(result["transcription_path"])
+ self.assertIsNotNone(result["summary_path"])
+ self.assertRegex(Path(result["transcription_path"]).name, r"^lazier_\d{6}\.txt$")
+ self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$")
+ self.assertEqual(result["result_path"], result["summary_path"])
+ self.assertTrue(Path(result["transcription_path"]).exists())
+ self.assertTrue(Path(result["summary_path"]).exists())
def test_text_summarize_generates_summary_file_legacy(self):
text_path = self.temp_dir / "article.txt"
text_path.write_text("This is a long article in English.", encoding="utf-8")
with patch(
+ "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text
+ ), patch(
"lazier.core.processing.render_text_in_portuguese",
return_value="Este é um artigo longo em português.",
), patch(
@@ -80,8 +95,12 @@ class ProcessingTests(unittest.TestCase):
self.assertEqual(result["transcription"], "Este é um artigo longo em português.")
self.assertEqual(result["content_type"], "tech_doc")
self.assertIsNone(result["smart_summary"])
- self.assertTrue(result["result_path"].endswith("sumario.txt"))
- self.assertTrue(Path(result["result_path"]).exists())
+ self.assertIsNotNone(result["transcription_path"])
+ self.assertIsNotNone(result["summary_path"])
+ self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$")
+ self.assertEqual(result["result_path"], result["summary_path"])
+ self.assertTrue(Path(result["transcription_path"]).exists())
+ self.assertTrue(Path(result["summary_path"]).exists())
def test_text_summarize_uses_smart_summary_when_enabled(self):
text_path = self.temp_dir / "smart.txt"
@@ -98,6 +117,8 @@ class ProcessingTests(unittest.TestCase):
}
with patch(
+ "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text
+ ), patch(
"lazier.core.processing.render_text_in_portuguese",
return_value="Texto convertido para portugues",
), patch(
@@ -125,7 +146,10 @@ class ProcessingTests(unittest.TestCase):
self.assertEqual(result["smart_summary"], smart_payload)
self.assertEqual(result["content_type"], "lecture")
self.assertIn("Resumo curto", result["summary"])
- self.assertTrue(result["result_path"].endswith("sumario.md"))
+ self.assertIsNotNone(result["transcription_path"])
+ self.assertIsNotNone(result["summary_path"])
+ self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.md$")
+ self.assertEqual(result["result_path"], result["summary_path"])
if __name__ == "__main__":