lazier

personal summarizer
Log | Files | Refs | README

commit 30c86b97c78ea073da753b26ce14c3aa9ddc89bc
parent 130c5b6924ead47b5fd9865bcd19e4dd359d0bdf
Author: Pablo Murad <pblmrd@gmail.com>
Date:   Sat,  9 May 2026 22:51:39 -0300

Mariela Boca Murcha

Diffstat:
Mlazier/api/routes.py | 42+++++++++++++++++++++++++++++++-----------
Mlazier/cli.py | 49++++++++++++++++++++++++++++++++++++++++++-------
Mlazier/core/config.py | 4++++
Mlazier/core/jobs.py | 19+++++++++++--------
Mlazier/core/processing.py | 278++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
Alazier/diarization.py | 153+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mlazier/summarizer.py | 83++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mlazier/web/templates/index.html | 78+++++++++++++++++++++++++-----------------------------------------------------
Mtests/test_api.py | 6+++---
Atests/test_diarization.py | 31+++++++++++++++++++++++++++++++
Mtests/test_jobs.py | 2+-
Mtests/test_processing.py | 34+++++++++++++++++++++++++++++-----
12 files changed, 654 insertions(+), 125 deletions(-)

diff --git a/lazier/api/routes.py b/lazier/api/routes.py @@ -17,7 +17,7 @@ from pydantic import BaseModel from ..audio_processor import extract_audio_from_video from ..core.formats import export -from ..core.jobs import build_job_artifact_path, get_job_store, slugify_source_name +from ..core.jobs import build_job_artifact_path, get_job_store, lazier_download_filename from ..core.processing import process_source from ..core.supported_sites import SUPPORTED_VIDEO_SITES from ..core.playlist import is_playlist_url @@ -52,22 +52,24 @@ def _resolve_mode( summarize: Optional[bool], ) -> str: if mode: - if mode not in {"transcribe", "summarize"}: - raise HTTPException(status_code=400, detail="Modo invalido. Use 'transcribe' ou 'summarize'.") + if mode not in {"transcribe", "summarize", "process"}: + raise HTTPException( + status_code=400, + detail="Modo invalido. Use 'process' (padrao), 'transcribe' ou 'summarize'.", + ) return mode if transcribe is None and summarize is None: - raise HTTPException(status_code=400, detail="Informe `mode`.") + return "process" + if transcribe and summarize: + return "process" if transcribe and not summarize: return "transcribe" if summarize and not transcribe: return "summarize" - raise HTTPException( - status_code=400, - detail="A combinacao legada de transcribe/summarize nao aceita mais os dois modos ao mesmo tempo.", - ) + return "process" def _job_title(job: dict) -> str: @@ -86,10 +88,24 @@ def _job_title(job: dict) -> str: def _progress_updater(job_id: str): store = get_job_store() - def callback(progress: int, status: str, message: Optional[str] = None) -> None: + def callback( + progress: int, + status: str, + message: Optional[str] = None, + *, + metadata_patch: Optional[dict] = None, + ) -> None: updates = {"progress": progress, "status": status} if status == "failed": updates["error"] = message + if metadata_patch: + job_row = store.get_job(job_id) + if job_row: + base = dict(job_row.get("metadata") or {}) + for key in ("title", "webpage_url"): + if key in metadata_patch and metadata_patch[key] is not None: + base[key] = metadata_patch[key] + updates["metadata"] = base store.update_job(job_id, **updates) broadcast_progress(job_id, progress, status, message) @@ -383,6 +399,11 @@ async def get_job_status(job_id: str): "mode": job["mode"], "status": job["status"], "progress": job.get("progress", 0), + "title": _job_title(job), + "url": job.get("source_url"), + "file_path": job.get("file_path"), + "format": job.get("format"), + "created_at": job.get("created_at"), "result_path": job.get("result_path"), "transcription_path": job.get("transcription_path"), "summary_path": job.get("summary_path"), @@ -489,8 +510,7 @@ async def download_bundle(job_id: str, background_tasks: BackgroundTasks): tmp_path.unlink(missing_ok=True) raise - slug = slugify_source_name(job.get("source_name")) - zip_filename = f"{slug}-{job_id[:8]}-tudo.zip" + zip_filename = lazier_download_filename("zip") background_tasks.add_task(_unlink_quiet, str(tmp_path)) return FileResponse(str(tmp_path), media_type="application/zip", filename=zip_filename) diff --git a/lazier/cli.py b/lazier/cli.py @@ -31,7 +31,7 @@ console = Console() def _progress_notifier(progress_bar, task_id): - def callback(progress: int, _status: str, message: Optional[str] = None): + def callback(progress: int, _status: str, message: Optional[str] = None, *, metadata_patch=None): if message: progress_bar.update(task_id, description=f"[cyan]{message}") progress_bar.update(task_id, completed=progress) @@ -106,7 +106,11 @@ def _run_mode( sys.exit(1) console.print(f"\n[bold green]✓ Processamento concluido![/bold green]") - console.print(f"[cyan]Arquivo gerado:[/cyan] {result['result_path']}") + console.print(f"[cyan]Arquivo principal:[/cyan] {result['result_path']}") + if result.get("transcription_path") and result["transcription_path"] != result.get("result_path"): + console.print(f"[cyan]Transcricao:[/cyan] {result['transcription_path']}") + if result.get("summary_path") and result["summary_path"] != result.get("result_path"): + console.print(f"[cyan]Sumario:[/cyan] {result['summary_path']}") if result.get("content_type"): console.print(f"[cyan]Tipo detectado:[/cyan] {result['content_type']}") if result.get("chapters"): @@ -125,9 +129,9 @@ def cli(ctx, input_path): if input_path: console.print( Panel.fit( - "Use comandos explicitos:\n\n" - "`lazier transcribe <input>` para transcrever em portugues\n" - "`lazier summarize <input>` para gerar um sumario em portugues", + "Fluxo unificado: transcrição em PT-BR + sumário (ajuste com LAZIER_ALWAYS_SUMMARY).\n\n" + "`lazier process <input>` — fluxo unificado explícito.\n" + "`lazier transcribe` / `lazier summarize` — modos legados (ver LAZIER_ALWAYS_SUMMARY).", title="Modo de Uso", ) ) @@ -182,7 +186,7 @@ def transcribe( chapters_flag: Optional[bool], reasoning: Optional[str], ): - """Transcreve ou converte o conteudo para portugues.""" + """Converte/transcreve para PT-BR; com LAZIER_ALWAYS_SUMMARY=true também gera sumário.""" _run_mode( input_path=input_path, mode="transcribe", @@ -211,7 +215,7 @@ def summarize( chapters_flag: Optional[bool], reasoning: Optional[str], ): - """Gera um sumario em portugues do conteudo informado.""" + """Foco em sumário; com LAZIER_ALWAYS_SUMMARY=true também exporta transcrição completa.""" _run_mode( input_path=input_path, mode="summarize", @@ -226,6 +230,35 @@ def summarize( @cli.command() +@click.argument("input_path", type=str) +@click.option("--output", "-o", type=str, help="Nome do arquivo de saida") +@click.option("--format", "-f", "format_type", type=click.Choice(["docx", "txt", "md", "json", "pdf"]), default="docx", help="Formato de saida") +@_model_options +def process( + input_path: str, + output: Optional[str], + format_type: str, + model: Optional[str], + gpt_model: Optional[str], + smart_flag: Optional[bool], + chapters_flag: Optional[bool], + reasoning: Optional[str], +): + """Executa o pipeline unificado (transcrição + sumário em PT-BR).""" + _run_mode( + input_path=input_path, + mode="process", + output=output, + format_type=format_type, + model=model, + gpt_model=gpt_model, + smart=smart_flag, + chapters=chapters_flag, + reasoning=reasoning, + ) + + +@cli.command() def config(): """Mostra a configuracao corrente de modelos.""" cfg = get_model_config(refresh=True) @@ -240,6 +273,8 @@ def config(): f"Capitulos: [cyan]{'on' if cfg.enable_chapters else 'off'}[/cyan]\n" f"Preset qualidade: [cyan]{cfg.quality_preset}[/cyan]\n" f"Sumario hierarquico: [cyan]{'on' if cfg.hierarchical_summary else 'off'}[/cyan]\n" + f"Sempre gerar sumario (LAZIER_ALWAYS_SUMMARY): [cyan]{'on' if cfg.always_summary else 'off'}[/cyan]\n" + f"Diarizacao (LAZIER_DIARIZATION_PROVIDER): [cyan]{cfg.diarization_provider or 'none'}[/cyan]\n" f"STT alternativo (flag): [cyan]{'on' if cfg.alt_stt_enabled else 'off'}[/cyan]", title="Lazier - Config Atual", ) diff --git a/lazier/core/config.py b/lazier/core/config.py @@ -107,6 +107,8 @@ class ModelConfig: summary_map_chunk_chars: int = 14_000 summary_chunk_overlap_chars: int = 1_400 alt_stt_enabled: bool = False + always_summary: bool = True + diarization_provider: str = "none" def supports_reasoning(self, model: Optional[str] = None) -> bool: """Indica se devemos enviar `reasoning_effort` para um modelo.""" @@ -153,6 +155,8 @@ def get_model_config(refresh: bool = False) -> ModelConfig: summary_map_chunk_chars=_env_int("LAZIER_SUMMARY_MAP_CHUNK_CHARS", 14_000, min_value=2_000), summary_chunk_overlap_chars=_env_int("LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS", 1_400, min_value=0), alt_stt_enabled=_env_bool("LAZIER_ALT_STT_ENABLED", False), + always_summary=_env_bool("LAZIER_ALWAYS_SUMMARY", True), + diarization_provider=_env_str("LAZIER_DIARIZATION_PROVIDER", "none").lower(), ) return _cached_config diff --git a/lazier/core/jobs.py b/lazier/core/jobs.py @@ -5,6 +5,7 @@ Persistencia de jobs e organizacao de arquivos de saida. import json import os import re +import secrets import sqlite3 import threading import unicodedata @@ -14,11 +15,14 @@ from typing import Any, Dict, List, Optional, Union TERMINAL_STATUSES = {"completed", "failed", "interrupted"} -ARTIFACT_FILENAMES = { - "transcription": "transcricao", - "summary": "sumario", - "result": "resultado", -} +ARTIFACT_KINDS = frozenset({"transcription", "summary", "result"}) + + +def lazier_download_filename(format_type: str) -> str: + """Nome estável para downloads: lazier_<6 dígitos aleatórios>.<ext> (sem espaços ou unicode).""" + n = secrets.randbelow(900_000) + 100_000 + ext = format_type.lstrip(".") + return f"lazier_{n}.{ext}" def _coerce_datetime(value: Optional[Union[str, datetime]]) -> datetime: @@ -72,7 +76,7 @@ def build_job_artifact_path( created_at: Optional[Union[str, datetime]] = None, output_root: Optional[Path] = None, ) -> Path: - if artifact_kind not in ARTIFACT_FILENAMES: + if artifact_kind not in ARTIFACT_KINDS: raise ValueError(f"Artefato nao suportado: {artifact_kind}") output_dir = build_job_output_dir( @@ -82,8 +86,7 @@ def build_job_artifact_path( output_root=output_root, ) output_dir.mkdir(parents=True, exist_ok=True) - filename = ARTIFACT_FILENAMES[artifact_kind] - return output_dir / f"{filename}.{format_type}" + return output_dir / lazier_download_filename(format_type) class JobStore: diff --git a/lazier/core/processing.py b/lazier/core/processing.py @@ -5,11 +5,12 @@ Inclui as etapas de: - Validacao de input - Download (YouTube/web) ou preparacao de audio - Transcricao (com ou sem timestamps) -- Conversao para PT-BR +- Diarizacao opcional (LAZIER_DIARIZATION_PROVIDER) antes da conversao para PT-BR +- Conversao para PT-BR + revisao ortografica leve (polish_pt_br_text) - Deteccao de tipo de conteudo -- Sumario (legado ou estruturado) +- Sumario (legado ou estruturado; LAZIER_ALWAYS_SUMMARY=false recupera fluxos antigos) - Capitulos com timestamps -- Export final +- Export final (dois artefactos quando transcrição + sumário e saida por job) """ from __future__ import annotations @@ -29,8 +30,10 @@ from .exceptions import MusicContentError from .jobs import build_job_artifact_path, get_outputs_root from ..audio_processor import prepare_audio_file from ..downloader import download_video_audio, download_youtube_audio +from ..diarization import maybe_enrich_transcript_with_diarization from ..summarizer import ( format_smart_summary_as_text, + polish_pt_br_text, render_text_in_portuguese, summarize_smart, summarize_text, @@ -40,16 +43,38 @@ from ..utils import cleanup_files, validate_input from ..web.extractor import extract_pdf_content, extract_text_file_content, extract_web_content from .formats import export -ProgressCallback = Optional[Callable[[int, str, Optional[str]], None]] +ProgressCallback = Optional[Callable[..., None]] MEDIA_INPUT_TYPES = {"audio", "video", "youtube"} _pipeline_log = logging.getLogger("lazier.pipeline") -def _notify(callback: ProgressCallback, progress: int, status: str, message: Optional[str] = None) -> None: +def _early_title_patch(metadata: Dict[str, Any]) -> Optional[Dict[str, Any]]: + """Campos seguros para mostrar título cedo na WebGUI (merge na metadata do job).""" + title = metadata.get("title") + if not title: + return None + patch: Dict[str, Any] = {"title": str(title)} + wu = metadata.get("webpage_url") + if wu: + patch["webpage_url"] = str(wu) + return patch + + +def _notify( + callback: ProgressCallback, + progress: int, + status: str, + message: Optional[str] = None, + *, + metadata_patch: Optional[Dict[str, Any]] = None, +) -> None: if callback: - callback(progress, status, message) + if metadata_patch: + callback(progress, status, message, metadata_patch=metadata_patch) + else: + callback(progress, status, message) def _get_cache_manager_safe(): @@ -68,11 +93,26 @@ def _ensure_api_key() -> None: def _ensure_mode(mode: str) -> str: - if mode not in {"transcribe", "summarize"}: - raise ValueError("Modo invalido. Use 'transcribe' ou 'summarize'.") + if mode not in {"transcribe", "summarize", "process"}: + raise ValueError("Modo invalido. Use 'transcribe', 'summarize' ou 'process'.") return mode +def _maybe_polish_pt_br(text: Optional[str], kind: str, runtime: Dict[str, Any]) -> Optional[str]: + if not text or not str(text).strip(): + return text + try: + return polish_pt_br_text( + text, + kind=kind, + model=runtime["chat_model"], + reasoning_effort=runtime["reasoning_effort"], + ) + except Exception as exc: + _pipeline_log.warning("polish_pt_br falhou (%s): %s", kind, exc) + return text + + def _pipeline_trace(job_id: Optional[str], step: str, **fields: Any) -> None: if not job_id: return @@ -95,28 +135,42 @@ def _resolve_runtime( "chapters_enabled": config.enable_chapters if use_chapters is None else use_chapters, "reasoning_effort": config.reasoning_effort, "quality_preset": config.quality_preset, + "always_summary": config.always_summary, } -def _export_selected_artifact( +def _export_single_user_path( mode: str, format_type: str, output_path: str, transcription: str, summary: Optional[str], metadata: Dict[str, Any], + *, + prefer_summary: bool, ) -> str: - if mode == "transcribe": + """Um único ficheiro (-o / CLI): documento completo quando há transcrição e sumário.""" + has_tx = bool(transcription and transcription.strip()) + has_sm = bool(summary and str(summary).strip()) + if has_tx and has_sm: return export( transcription=transcription, - summary=None, + summary=summary, + metadata=metadata, + output_path=output_path, + format_type=format_type, + ) + if has_sm and (not has_tx or prefer_summary): + return export( + transcription="", + summary=summary, metadata=metadata, output_path=output_path, format_type=format_type, ) return export( - transcription="", - summary=summary, + transcription=transcription or "", + summary=None, metadata=metadata, output_path=output_path, format_type=format_type, @@ -166,6 +220,13 @@ def _transcribe_media( _notify(progress_callback, progress_start, "processing", "Transcrevendo audio...") raw_text = transcribe_audio(audio_file, language=None, model=runtime["transcribe_model"]) + raw_text, segments = maybe_enrich_transcript_with_diarization( + audio_file, + raw_text, + segments, + metadata, + ) + _notify(progress_callback, midpoint, "processing", "Convertendo conteudo para portugues...") portuguese_text = render_text_in_portuguese( raw_text, @@ -376,6 +437,7 @@ def process_source( reasoning_effort=runtime["reasoning_effort"], smart_summary=runtime["smart_summary"], chapters_enabled=runtime["chapters_enabled"], + always_summary=runtime["always_summary"], ) if input_type == "youtube": @@ -390,11 +452,24 @@ def process_source( metadata["smart_summary"] = cached["smart_summary"] if cached.get("chapters"): metadata["chapters"] = cached["chapters"] - _notify(progress_callback, 70, "processing", "Conteudo do YouTube encontrado no cache") + _notify( + progress_callback, + 70, + "processing", + "Conteudo do YouTube encontrado no cache", + metadata_patch=_early_title_patch(metadata), + ) else: _notify(progress_callback, 15, "processing", "Baixando video do YouTube...") audio_file, metadata = download_youtube_audio(source) files_to_cleanup.append(audio_file) + _notify( + progress_callback, + 17, + "processing", + "Video obtido; a transcrever...", + metadata_patch=_early_title_patch(metadata), + ) portuguese_text, segments = _transcribe_media( audio_file, runtime=runtime, @@ -409,6 +484,13 @@ def process_source( _notify(progress_callback, 15, "processing", "Tentando extrair audio da URL...") audio_file, metadata = download_video_audio(source) files_to_cleanup.append(audio_file) + _notify( + progress_callback, + 16, + "processing", + "Audio da pagina obtido.", + metadata_patch=_early_title_patch(metadata), + ) cached = cache.get("video", url_hash) if cache else None if cached and cached.get("transcription"): metadata = cached.get("metadata", metadata) or metadata @@ -419,7 +501,13 @@ def process_source( metadata["smart_summary"] = cached["smart_summary"] if cached.get("chapters"): metadata["chapters"] = cached["chapters"] - _notify(progress_callback, 70, "processing", "Conteudo de video encontrado no cache") + _notify( + progress_callback, + 70, + "processing", + "Conteudo de video encontrado no cache", + metadata_patch=_early_title_patch(metadata), + ) else: portuguese_text, segments = _transcribe_media( audio_file, @@ -438,7 +526,13 @@ def process_source( summary = cached.get("summary") if cached.get("smart_summary"): metadata["smart_summary"] = cached["smart_summary"] - _notify(progress_callback, 70, "processing", "Conteudo web encontrado no cache") + _notify( + progress_callback, + 70, + "processing", + "Conteudo web encontrado no cache", + metadata_patch=_early_title_patch(metadata), + ) else: _notify(progress_callback, 20, "processing", "Extraindo texto da pagina web...") content_data = extract_web_content(source) @@ -446,6 +540,13 @@ def process_source( "title": content_data.get("title", "Pagina Web"), "webpage_url": source, } + _notify( + progress_callback, + 22, + "processing", + "Pagina obtida.", + metadata_patch=_early_title_patch(metadata), + ) portuguese_text = render_text_in_portuguese( content_data["content"], model=runtime["chat_model"], @@ -476,6 +577,13 @@ def process_source( "title": content_data.get("title", "Documento"), "file_path": source, } + _notify( + progress_callback, + 22, + "processing", + "Documento lido.", + metadata_patch=_early_title_patch(metadata), + ) portuguese_text = render_text_in_portuguese( content_data["content"], model=runtime["chat_model"], @@ -490,6 +598,9 @@ def process_source( # ---- Etapas pos-transcricao comuns a todos os tipos ---- if portuguese_text: + portuguese_text = _maybe_polish_pt_br(portuguese_text, "transcription", runtime) or "" + + if portuguese_text: _detect_and_attach_content_type(portuguese_text, metadata, runtime) chapters = _build_chapters_if_possible( @@ -499,7 +610,13 @@ def process_source( progress_callback=progress_callback, ) - if mode == "summarize" and not summary: + always_sum = runtime["always_summary"] + text_for_summary = (portuguese_text or "").strip() + should_build_summary = bool(text_for_summary) and ( + always_sum or mode in {"summarize", "process"} + ) + + if should_build_summary and not summary: summary = _build_summary( portuguese_text or "", metadata=metadata, @@ -507,7 +624,7 @@ def process_source( progress_callback=progress_callback, trace_job_id=trace_id, ) - elif mode == "summarize" and summary and runtime["smart_summary"] and not metadata.get("smart_summary"): + elif should_build_summary and summary and runtime["smart_summary"] and not metadata.get("smart_summary"): # Cache antigo guardou apenas summary textual; reaproveitamos como tldr. metadata["smart_summary"] = { "tldr": summary[:600], @@ -519,6 +636,9 @@ def process_source( "open_questions": [], } + if summary: + summary = _maybe_polish_pt_br(summary, "summary", runtime) + # ---- Atualiza caches "ricos" para fontes de midia ---- if cache and input_type == "youtube": cache.set( @@ -563,32 +683,118 @@ def process_source( resolved_source_name = source_name or metadata.get("title") or source + has_tx = bool(portuguese_text and str(portuguese_text).strip()) + has_sm = bool(summary and str(summary).strip()) + dual_artifacts = ( + not output_path + and has_tx + and has_sm + and (runtime["always_summary"] or mode == "process") + ) + + _notify(progress_callback, 92, "processing", f"Gerando arquivo {output_format.upper()}...") + + transcription_path: Optional[str] = None + summary_path: Optional[str] = None + result_path: Optional[str] = None + if output_path: final_output_path = Path(output_path) if final_output_path.suffix != f".{output_format}": final_output_path = final_output_path.with_suffix(f".{output_format}") - else: - artifact_kind = "transcription" if mode == "transcribe" else "summary" - final_output_path = build_job_artifact_path( + exported_path = _export_single_user_path( + mode, + output_format, + str(final_output_path), + portuguese_text or "", + summary, + metadata, + prefer_summary=(mode == "summarize"), + ) + result_path = exported_path + if has_tx and has_sm: + transcription_path = None + summary_path = None + elif has_sm: + transcription_path = None + summary_path = exported_path + else: + transcription_path = exported_path + summary_path = None + elif dual_artifacts: + tx_path = build_job_artifact_path( job_id=run_id, source_name=resolved_source_name, format_type=output_format, - artifact_kind=artifact_kind, + artifact_kind="transcription", created_at=created_at, output_root=output_root, ) + sm_path = build_job_artifact_path( + job_id=run_id, + source_name=resolved_source_name, + format_type=output_format, + artifact_kind="summary", + created_at=created_at, + output_root=output_root, + ) + export( + transcription=portuguese_text or "", + summary=None, + metadata=metadata, + output_path=str(tx_path), + format_type=output_format, + ) + export( + transcription="", + summary=summary, + metadata=metadata, + output_path=str(sm_path), + format_type=output_format, + ) + transcription_path = str(tx_path) + summary_path = str(sm_path) + result_path = summary_path or transcription_path + elif has_sm: + sm_only = build_job_artifact_path( + job_id=run_id, + source_name=resolved_source_name, + format_type=output_format, + artifact_kind="summary", + created_at=created_at, + output_root=output_root, + ) + exported_path = export( + transcription="", + summary=summary, + metadata=metadata, + output_path=str(sm_only), + format_type=output_format, + ) + result_path = exported_path + summary_path = exported_path + elif has_tx: + tx_only = build_job_artifact_path( + job_id=run_id, + source_name=resolved_source_name, + format_type=output_format, + artifact_kind="transcription", + created_at=created_at, + output_root=output_root, + ) + exported_path = export( + transcription=portuguese_text or "", + summary=None, + metadata=metadata, + output_path=str(tx_only), + format_type=output_format, + ) + result_path = exported_path + transcription_path = exported_path + else: + raise Exception("Nenhum conteudo gerado para exportar.") - _notify(progress_callback, 92, "processing", f"Gerando arquivo {output_format.upper()}...") - exported_path = _export_selected_artifact( - mode=mode, - format_type=output_format, - output_path=str(final_output_path), - transcription=portuguese_text or "", - summary=summary, - metadata=metadata, - ) - - output_dir = str(Path(exported_path).parent) + output_dir = str(Path(result_path or ".").parent) result = { "mode": mode, "input_type": input_type, @@ -599,9 +805,9 @@ def process_source( "smart_summary": metadata.get("smart_summary"), "chapters": chapters, "content_type": metadata.get("content_type"), - "result_path": exported_path, - "transcription_path": exported_path if mode == "transcribe" else None, - "summary_path": exported_path if mode == "summarize" else None, + "result_path": result_path, + "transcription_path": transcription_path, + "summary_path": summary_path, "output_dir": output_dir, } _pipeline_trace( diff --git a/lazier/diarization.py b/lazier/diarization.py @@ -0,0 +1,153 @@ +""" +Diarização opcional de falantes após STT. + +Provedores futuros (AssemblyAI, pyannote, etc.) devem preencher intervalos +``{start, end, speaker_id}`` em segundos; o texto é reorganizado em blocos +``**Falante N**`` alinhados aos segmentos do Whisper quando possível. +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional, Tuple + +from .core.config import get_model_config + +_log = logging.getLogger(__name__) + + +def _segment_midpoint_seconds(seg: Dict[str, Any]) -> float: + start = float(seg.get("start", 0) or 0) + end = float(seg.get("end", start) or start) + return (start + end) / 2.0 + + +def _speaker_key(interval: Dict[str, Any]) -> str: + sid = interval.get("speaker_id", interval.get("speaker")) + if sid is None: + return "0" + return str(sid) + + +def _build_speaker_rank(intervals: List[Dict[str, Any]]) -> Dict[str, int]: + """Mapeia identificador bruto do provedor para Falante 1..N (ordem de primeira ocorrência).""" + seen: List[str] = [] + for iv in intervals: + k = _speaker_key(iv) + if k not in seen: + seen.append(k) + return {k: i + 1 for i, k in enumerate(seen)} + + +def _speaker_num_at_time( + t: float, intervals: List[Dict[str, Any]], rank: Dict[str, int] +) -> int: + for iv in intervals: + try: + s = float(iv.get("start", 0)) + e = float(iv.get("end", s)) + except (TypeError, ValueError): + continue + if s <= t <= e: + k = _speaker_key(iv) + return rank.get(k, 1) + return 1 + + +def merge_transcript_with_speakers( + raw_text: str, + segments: List[Dict[str, Any]], + intervals: List[Dict[str, Any]], +) -> str: + """Junta texto bruto ou segmentos Whisper com rótulos **Falante N** (blocos consecutivos).""" + if not intervals: + return raw_text + + rank = _build_speaker_rank(intervals) + + if segments: + parts: List[str] = [] + current_sp: Optional[int] = None + buffer: List[str] = [] + + def flush() -> None: + nonlocal buffer, current_sp + if current_sp is None or not buffer: + return + text = " ".join(buffer).strip() + if text: + parts.append(f"**Falante {current_sp}**\n\n{text}") + buffer = [] + + for seg in segments: + text = (seg.get("text") or "").strip() + if not text: + continue + sp = _speaker_num_at_time(_segment_midpoint_seconds(seg), intervals, rank) + if current_sp is not None and sp != current_sp: + flush() + current_sp = sp + buffer.append(text) + flush() + return "\n\n".join(parts).strip() if parts else raw_text + + lines_out: List[str] = [] + current_sp = 1 + for para in raw_text.split("\n\n"): + p = para.strip() + if not p: + continue + lines_out.append(f"**Falante {current_sp}**\n\n{p}") + current_sp += 1 + return "\n\n".join(lines_out).strip() if lines_out else raw_text + + +def diarize_audio_intervals(audio_path: str, provider: str) -> List[Dict[str, Any]]: + """ + Executa diarização externa. Retorna lista vazia se desligado ou não implementado. + + Para integrar um provedor real, implemente aqui e defina LAZIER_DIARIZATION_PROVIDER. + """ + p = (provider or "").strip().lower() + if p in ("", "none", "off", "false"): + return [] + + if p in ("assemblyai", "deepgram", "google", "pyannote"): + _log.warning( + "Diarização com provedor '%s' ainda não está integrada; " + "use LAZIER_DIARIZATION_PROVIDER=none ou contribua com o conector.", + p, + ) + return [] + + _log.warning("LAZIER_DIARIZATION_PROVIDER='%s' desconhecido; ignorando.", provider) + return [] + + +def maybe_enrich_transcript_with_diarization( + audio_path: str, + raw_text: str, + segments: List[Dict[str, Any]], + metadata: Dict[str, Any], +) -> Tuple[str, List[Dict[str, Any]]]: + """ + Opcionalmente altera o texto bruto (pré PT-BR) com marcadores **Falante N**. + ``segments`` são os do Whisper (opcionalmente vazios). + """ + cfg = get_model_config() + provider = (cfg.diarization_provider or "none").strip().lower() + metadata["diarization_provider"] = provider + + if provider in ("", "none", "off", "false"): + metadata["diarization_status"] = "desligada" + return raw_text, segments + + intervals = diarize_audio_intervals(audio_path, provider) + if not intervals: + metadata["diarization_status"] = "indisponivel" + return raw_text, segments + + merged = merge_transcript_with_speakers(raw_text, segments, intervals) + metadata["diarization_status"] = "ativa" + metadata["diarization_segments"] = len(intervals) + return merged, segments diff --git a/lazier/summarizer.py b/lazier/summarizer.py @@ -238,7 +238,8 @@ def _render_portuguese_chunk(text: str, model: str, reasoning_effort: Optional[s "- Se o texto ja estiver em portugues, mantenha em portugues do Brasil natural.\n" "- Nao resuma, nao explique, nao comente o texto.\n" "- Preserve nomes proprios, numeros, datas, links, listas e estrutura.\n" - "- Mantenha o maximo de fidelidade possivel ao conteudo original.\n\n" + "- Mantenha o maximo de fidelidade possivel ao conteudo original.\n" + "- Use acentuacao e ortografia corretas em portugues do Brasil.\n\n" "Texto:\n\n" ) @@ -261,6 +262,86 @@ def _render_portuguese_chunk(text: str, model: str, reasoning_effort: Optional[s raise _wrap_chat_error(exc, "converter texto para portugues") from exc +POLISH_PT_BR_CHUNK_CHARS = 24_000 + + +def polish_pt_br_text( + text: str, + kind: str = "transcription", + model: Optional[str] = None, + reasoning_effort: Optional[str] = None, +) -> str: + """ + Revisao ortografica leve em pt-BR: acentos, grafia, paragrafos. + Nao altera factos, nomes proprios, numeros, links nem marcadores **Falante N**. + """ + + if not text or not text.strip(): + return text or "" + + config = get_model_config() + chosen_model = model or config.chat_model + max_chars = min(POLISH_PT_BR_CHUNK_CHARS, DEFAULT_CHUNK_CHAR_LIMIT) + + kind_norm = (kind or "transcription").strip().lower() + if kind_norm == "summary": + kind_hint = ( + "Este trecho e um sumario ou sintese: mantenha tom conciso e bullet/numeracao se ja existir." + ) + else: + kind_hint = ( + "Este trecho e transcricao ou texto corrido: preserve a ordem e falas; " + "nao resuma nem omita frases." + ) + + if len(text) <= max_chars: + return _polish_pt_br_chunk(text, chosen_model, kind_hint, reasoning_effort=reasoning_effort) + + chunks = _split_text_into_chunks(text, max_chars) + out: List[str] = [] + for i, chunk in enumerate(chunks): + print(f"Revisao ortografica pt-BR: parte {i + 1}/{len(chunks)}...") + out.append(_polish_pt_br_chunk(chunk, chosen_model, kind_hint, reasoning_effort=reasoning_effort)) + return "\n\n".join(c.strip() for c in out if c.strip()) + + +def _polish_pt_br_chunk( + text: str, + model: str, + kind_hint: str, + reasoning_effort: Optional[str] = None, +) -> str: + client = _ensure_client() + prompt = ( + "Revise o texto em portugues do Brasil.\n\n" + f"{kind_hint}\n\n" + "Regras obrigatorias:\n" + "- Corrija acentuacao e ortografia em pt-BR.\n" + "- Melhore quebras de linha e paragrafos quando ajudar a leitura.\n" + "- Nao altere factos, dados, numeros, datas, nomes proprios, codigos, URLs.\n" + "- Preserve linhas que comecam com **Falante N** e a estrutura logo abaixo delas.\n" + "- Nao adicione comentarios meta; devolva apenas o texto revisado.\n\n" + "Texto:\n\n" + ) + try: + kwargs = _chat_completions_kwargs( + model=model, + messages=[ + { + "role": "system", + "content": "Voce e um revisor editorial em portugues do Brasil, preciso e conservador.", + }, + {"role": "user", "content": prompt + text}, + ], + temperature=0.05, + reasoning_effort=reasoning_effort, + ) + response = client.chat.completions.create(**kwargs) + return (response.choices[0].message.content or "").strip() + except Exception as exc: + raise _wrap_chat_error(exc, "revisar ortografia em portugues") from exc + + # --------------------------------------------------------------------------- # Sumario textual legado # --------------------------------------------------------------------------- diff --git a/lazier/web/templates/index.html b/lazier/web/templates/index.html @@ -319,7 +319,7 @@ <header class="topbar"> <div class="brand"> <h1>Lazier</h1> - <p>Transcrição ou sumário em português do Brasil.</p> + <p>Transcrição e sumário em português do Brasil em cada processamento.</p> </div> <nav class="nav"> <a href="#" class="active" onclick="showPage('process');return false;">Processar</a> @@ -329,18 +329,12 @@ <section id="page-process" class="page active"> <div class="section-title">Novo processamento</div> - <p class="section-lead">A saída final será em português do Brasil.</p> + <p class="section-lead">Gera transcrição completa e sumário em PT-BR; escolha no download o que quer guardar.</p> + <p class="subtle" style="margin-top:-8px;">Diarização de falantes é opcional (LAZIER_DIARIZATION_PROVIDER); sem ela, o texto não traz separação por falante.</p> <div class="stack"> <div class="upload" id="uploadArea"><input type="file" id="fileInput" hidden multiple accept="audio/*,video/*,application/pdf,text/plain,text/markdown,text/html,.md,.htm,.txt"></div> <div class="field"><label for="urlInput">URL (opcional)</label><input type="text" id="urlInput" placeholder="YouTube, página, áudio em linha…"></div> - <div class="field"> - <label>Modo</label> - <div class="mode-grid"> - <div class="mode selected" onclick="selectMode('transcribe',this)"><input type="radio" checked value="transcribe"><strong>Transcrever</strong><span class="meta">Texto completo em PT-BR.</span></div> - <div class="mode" onclick="selectMode('summarize',this)"><input type="radio" value="summarize"><strong>Resumir</strong><span class="meta">Sumário em PT-BR.</span></div> - </div> - </div> <div class="row-actions"> <div class="field"> <label for="formatSelect">Formato</label> @@ -389,7 +383,7 @@ <script> let selectedFiles = []; - let processingMode = 'transcribe'; + let processingMode = 'process'; let allJobs = []; let currentFilter = 'all'; @@ -420,12 +414,6 @@ } function removeFile(index) { selectedFiles.splice(index, 1); document.getElementById('fileInput').value = ''; renderUpload(); } - function selectMode(mode, element) { - processingMode = mode; - document.querySelectorAll('.mode').forEach((node) => node.classList.remove('selected')); - element.classList.add('selected'); - } - function showPage(page) { document.querySelectorAll('.page').forEach((node) => node.classList.remove('active')); document.querySelectorAll('.nav a').forEach((node) => node.classList.remove('active')); @@ -474,7 +462,10 @@ function renderJob(job) { const title = job.title || job.url || job.file_path || `Job ${job.id}`; const statusLabel = { pending:'Pendente', processing:'A processar', completed:'Concluído', failed:'Falhou', interrupted:'Interrompido' }[job.status] || job.status; - return `<div class="job-head"><div><div class="job-title">${escapeHtml(title)}</div><div class="chips">${job.mode ? `<span class="chip">${job.mode === 'transcribe' ? 'Transcrição' : 'Sumário'}</span>` : ''}${job.format ? `<span class="chip">${job.format.toUpperCase()}</span>` : ''}${job.created_at ? `<span class="chip">${new Date(job.created_at).toLocaleString('pt-BR')}</span>` : ''}</div></div><span class="status ${job.status}">${statusLabel}</span></div><div class="bar"><span style="width:${job.progress || 0}%"></span></div>${job.error ? `<div class="error">${escapeHtml(job.error)}</div>` : ''}${renderActions(job)}`; + const modeChip = job.mode === 'process' + ? 'Transcrição + sumário' + : (job.mode === 'transcribe' ? 'Transcrição' : (job.mode === 'summarize' ? 'Sumário' : '')); + return `<div class="job-head"><div><div class="job-title">${escapeHtml(title)}</div><div class="chips">${modeChip ? `<span class="chip">${modeChip}</span>` : ''}${job.format ? `<span class="chip">${job.format.toUpperCase()}</span>` : ''}${job.created_at ? `<span class="chip">${new Date(job.created_at).toLocaleString('pt-BR')}</span>` : ''}</div></div><span class="status ${job.status}">${statusLabel}</span></div><div class="bar"><span style="width:${job.progress || 0}%"></span></div>${job.error ? `<div class="error">${escapeHtml(job.error)}</div>` : ''}${renderActions(job)}`; } function renderActions(job) { @@ -486,35 +477,17 @@ const hrefDl = `/api/jobs/${id}/download`; const hrefBundle = `/api/jobs/${id}/download-bundle`; + if (job.has_transcription) { + html += `<a class="action" href="${hrefTx}">Transcrição</a>`; + } + if (job.has_summary) { + html += `<a class="action" href="${hrefSum}">Sumário</a>`; + } if (job.has_transcription && job.has_summary) { - html += `<a class="action" href="${hrefBundle}">Tudo</a>`; + html += `<a class="action" href="${hrefBundle}">Pacote ZIP</a>`; } - - if (job.mode === 'transcribe') { - if (job.has_transcription || job.result_path) { - html += `<a class="action" href="${hrefTx}">Transcrição</a>`; - } - if (job.has_summary) { - html += `<a class="action" href="${hrefSum}">Sumário</a>`; - } - } else if (job.mode === 'summarize') { - if (job.has_summary || job.result_path) { - html += `<a class="action" href="${hrefSum}">Sumário</a>`; - } - if (job.has_transcription) { - html += `<a class="action" href="${hrefTx}">Transcrição completa</a>`; - } - } else { - if (job.has_summary && !job.has_transcription) { - html += `<a class="action" href="${hrefSum}">Sumário</a>`; - } else if (job.has_transcription && !job.has_summary) { - html += `<a class="action" href="${hrefTx}">Transcrição</a>`; - } else if (job.has_transcription && job.has_summary) { - html += `<a class="action" href="${hrefTx}">Transcrição completa</a>`; - html += `<a class="action" href="${hrefSum}">Sumário</a>`; - } else if (job.result_path) { - html += `<a class="action" href="${hrefDl}">Download</a>`; - } + if (!job.has_transcription && !job.has_summary && job.result_path) { + html += `<a class="action" href="${hrefDl}">Download</a>`; } html += `<button type="button" class="action" onclick="viewJobDetails('${id}')">Visualizar</button></div>`; @@ -533,7 +506,12 @@ function updateJob(jobId, data) { const node = document.getElementById(`job-${jobId}`); if (!node) return; - node.innerHTML = renderJob({ id: jobId, title: node.dataset.title || undefined, ...data }); + let title = node.dataset.title || ''; + if (data.title) { + title = data.title; + node.dataset.title = data.title; + } + node.innerHTML = renderJob({ id: jobId, title: title || undefined, ...data }); } async function startPolling(jobId) { @@ -600,14 +578,8 @@ if (!response.ok) throw new Error(data.detail || 'Erro ao carregar detalhes'); document.getElementById('previewTitle').textContent = data.metadata?.title || `Job ${jobId}`; let content = ''; - const summarizeFirst = data.mode === 'summarize'; - if (summarizeFirst) { - if (data.summary) { content += '<h3>Sumário</h3>' + renderPreview(data.summary, data.format) + '<hr>'; } - if (data.transcription) { content += '<h3>Transcrição</h3>' + renderPreview(data.transcription, data.format); } - } else { - if (data.transcription) { content += '<h3>Transcrição</h3>' + renderPreview(data.transcription, data.format) + '<hr>'; } - if (data.summary) { content += '<h3>Sumário</h3>' + renderPreview(data.summary, data.format); } - } + if (data.summary) { content += '<h3>Sumário</h3>' + renderPreview(data.summary, data.format) + '<hr>'; } + if (data.transcription) { content += '<h3>Transcrição</h3>' + renderPreview(data.transcription, data.format); } document.getElementById('previewContent').innerHTML = content || '<div class="empty">Sem conteúdo persistido.</div>'; document.getElementById('previewModal').classList.add('open'); } catch (error) { diff --git a/tests/test_api.py b/tests/test_api.py @@ -34,14 +34,14 @@ class ApiTests(unittest.TestCase): def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) - def test_process_rejects_legacy_double_mode(self): + def test_process_unified_when_both_legacy_flags(self): response = self.client.post( "/api/process", json={"url": "https://example.com/page", "format": "txt", "transcribe": True, "summarize": True}, ) - self.assertEqual(response.status_code, 400) - self.assertIn("nao aceita mais os dois modos", response.json()["detail"]) + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json().get("mode"), "process") def test_upload_persists_job_and_downloads_from_store(self): output_dir = Path(os.environ["LAZIER_OUTPUT_DIR"]) / "2026" / "03" / "31" / "sample-job" diff --git a/tests/test_diarization.py b/tests/test_diarization.py @@ -0,0 +1,31 @@ +"""Testes da fusão texto + intervalos de falantes.""" + +import unittest + +from lazier.diarization import merge_transcript_with_speakers + + +class DiarizationMergeTests(unittest.TestCase): + def test_merge_groups_consecutive_segments_per_speaker(self): + intervals = [ + {"start": 0.0, "end": 10.0, "speaker_id": "A"}, + {"start": 10.0, "end": 20.0, "speaker_id": "B"}, + ] + segments = [ + {"start": 1.0, "end": 3.0, "text": "Ola"}, + {"start": 4.0, "end": 6.0, "text": "mundo"}, + {"start": 11.0, "end": 14.0, "text": "Segundo falante"}, + ] + out = merge_transcript_with_speakers("", segments, intervals) + self.assertIn("**Falante 1**", out) + self.assertIn("**Falante 2**", out) + self.assertIn("Ola mundo", out) + self.assertIn("Segundo falante", out) + + def test_merge_without_intervals_returns_raw(self): + raw = "Apenas texto." + self.assertEqual(merge_transcript_with_speakers(raw, [], []), raw) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_jobs.py b/tests/test_jobs.py @@ -61,4 +61,4 @@ class JobStoreTests(unittest.TestCase): self.assertIn("03", str(output_dir)) self.assertIn("31", str(output_dir)) self.assertTrue(output_dir.name.startswith("minha-reuniao-abc12345")) - self.assertEqual(summary_path.name, "sumario.txt") + self.assertRegex(summary_path.name, r"^lazier_\d{6}\.txt$") diff --git a/tests/test_processing.py b/tests/test_processing.py @@ -30,8 +30,15 @@ class ProcessingTests(unittest.TestCase): audio_path.write_bytes(b"fake-audio") with patch("lazier.core.processing.transcribe_audio", return_value="Hello world"), patch( + "lazier.core.processing.maybe_enrich_transcript_with_diarization", + lambda _p, raw, segs, _md: (raw, segs), + ), patch( + "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text + ), patch( "lazier.core.processing.render_text_in_portuguese", return_value="Olá mundo" ), patch( + "lazier.core.processing.summarize_text", return_value="Resumo do áudio." + ), patch( "lazier.core.processing.detect_content_type", return_value={"content_type": "podcast", "confidence": 0.9, "rationale": ""}, ): @@ -47,15 +54,23 @@ class ProcessingTests(unittest.TestCase): ) self.assertEqual(result["transcription"], "Olá mundo") + self.assertEqual(result["summary"], "Resumo do áudio.") self.assertEqual(result["content_type"], "podcast") - self.assertTrue(result["result_path"].endswith("transcricao.txt")) - self.assertTrue(Path(result["result_path"]).exists()) + self.assertIsNotNone(result["transcription_path"]) + self.assertIsNotNone(result["summary_path"]) + self.assertRegex(Path(result["transcription_path"]).name, r"^lazier_\d{6}\.txt$") + self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$") + self.assertEqual(result["result_path"], result["summary_path"]) + self.assertTrue(Path(result["transcription_path"]).exists()) + self.assertTrue(Path(result["summary_path"]).exists()) def test_text_summarize_generates_summary_file_legacy(self): text_path = self.temp_dir / "article.txt" text_path.write_text("This is a long article in English.", encoding="utf-8") with patch( + "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text + ), patch( "lazier.core.processing.render_text_in_portuguese", return_value="Este é um artigo longo em português.", ), patch( @@ -80,8 +95,12 @@ class ProcessingTests(unittest.TestCase): self.assertEqual(result["transcription"], "Este é um artigo longo em português.") self.assertEqual(result["content_type"], "tech_doc") self.assertIsNone(result["smart_summary"]) - self.assertTrue(result["result_path"].endswith("sumario.txt")) - self.assertTrue(Path(result["result_path"]).exists()) + self.assertIsNotNone(result["transcription_path"]) + self.assertIsNotNone(result["summary_path"]) + self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$") + self.assertEqual(result["result_path"], result["summary_path"]) + self.assertTrue(Path(result["transcription_path"]).exists()) + self.assertTrue(Path(result["summary_path"]).exists()) def test_text_summarize_uses_smart_summary_when_enabled(self): text_path = self.temp_dir / "smart.txt" @@ -98,6 +117,8 @@ class ProcessingTests(unittest.TestCase): } with patch( + "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text + ), patch( "lazier.core.processing.render_text_in_portuguese", return_value="Texto convertido para portugues", ), patch( @@ -125,7 +146,10 @@ class ProcessingTests(unittest.TestCase): self.assertEqual(result["smart_summary"], smart_payload) self.assertEqual(result["content_type"], "lecture") self.assertIn("Resumo curto", result["summary"]) - self.assertTrue(result["result_path"].endswith("sumario.md")) + self.assertIsNotNone(result["transcription_path"]) + self.assertIsNotNone(result["summary_path"]) + self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.md$") + self.assertEqual(result["result_path"], result["summary_path"]) if __name__ == "__main__":