lazier

personal summarizer
Log | Files | Refs | README

commit 84a9c327bcf4288fbafdcad5315dcf277bae74be
parent b1738d8d282ca60a93ad65b88b9f0959537ad129
Author: Pablo Murad <pblmrd@gmail.com>
Date:   Fri, 29 May 2026 19:14:46 -0300

xerxes

Diffstat:
M .env.example | 54 ++++++++++++++++++++++++++++++------------------------
M README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
M lazier/api/routes.py | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M lazier/audio_processor.py | 47 ++++++++++++++++++++++++++++++++++++++++++-----
M lazier/cli.py | 19 ++++++++++++-------
M lazier/core/config.py | 50 ++++++++++++++++++++++++++++----------------------
M lazier/core/processing.py | 67 +++++++++++++++++++++++++++++++++++++++++++++----------------------
M lazier/summarizer.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M lazier/transcriber.py | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M lazier/web/templates/index.html | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
M tests/test_api.py | 23 +++++++++++++++++++++++
M tests/test_processing.py | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------------------
M tests/test_smart_summary.py | 1 +
13 files changed, 705 insertions(+), 300 deletions(-)

diff --git a/.env.example b/.env.example @@ -9,48 +9,54 @@ OPENAI_API_KEY= # YOUTUBE_PO_TOKEN=android.gvs+XXX # --------------------------------------------------------------------------- -# Modelos OpenAI (opcional, defaults sao seguros) +# Modelos OpenAI (default = perfil economico / rapido) # --------------------------------------------------------------------------- -# Modelo de chat usado para sumario, conversao para PT-BR e deteccao de tipo. -# Default: gpt-5-mini. Alternativas: gpt-5, gpt-5-nano, gpt-4.1, gpt-4.1-mini, gpt-4o-mini. -# OPENAI_CHAT_MODEL=gpt-5-mini +# Chat: sumario, conversao PT-BR. Default: gpt-4o-mini (barato e rapido). +# Alternativas: gpt-5-nano, gpt-5-mini, gpt-4.1-mini. +# OPENAI_CHAT_MODEL=gpt-4o-mini -# Modelo padrao de transcricao (saida em texto). -# Default: gpt-4o-mini-transcribe. Alternativas: gpt-4o-transcribe, whisper-1. +# Transcricao. Default: gpt-4o-mini-transcribe. # OPENAI_TRANSCRIBE_MODEL=gpt-4o-mini-transcribe -# Modelo usado quando precisamos de timestamps (verbose_json) para gerar capitulos. -# Default: whisper-1 (ainda e o mais confiavel para segments com start/end). +# Timestamps para capitulos (verbose_json). Default: whisper-1. # OPENAI_TRANSCRIBE_TIMESTAMPS_MODEL=whisper-1 -# Liga/desliga geracao de capitulos com timestamps em audio/video. -# OPENAI_ENABLE_CHAPTERS=true +# Capitulos com timestamps (STT mais lento). Default: false. +# OPENAI_ENABLE_CHAPTERS=false -# Liga/desliga sumario estruturado (TL;DR, key points, decisoes, action items, etc). -# Quando false, mantem o sumario textual legado. +# Sumario estruturado (SmartSummary). Default: true. # OPENAI_ENABLE_SMART_SUMMARY=true -# Esforco de raciocinio para modelos da familia gpt-5/o-series. -# Valores validos: minimal, low, medium, high. Default: medium. -# OPENAI_REASONING_EFFORT=medium +# Raciocinio para modelos gpt-5/o-series. Default: minimal. 
+# OPENAI_REASONING_EFFORT=minimal # --------------------------------------------------------------------------- -# Preset global e sumarizacao hierarquica (textos longos) +# Preset global e sumarizacao # --------------------------------------------------------------------------- -# Preset global (unico controlo): economico | equilibrado | maximo. -# Mapeia defaults de chat, transcricao e reasoning (sobrescritos por OPENAI_* quando definidos). -# LAZIER_QUALITY_PRESET=equilibrado +# economico | equilibrado | maximo. Default: economico. +# LAZIER_QUALITY_PRESET=economico -# Sumario map-reduce com overlap para transcricoes acima do limiar (caracteres). # LAZIER_SUMMARY_HIERARCHICAL=true -# LAZIER_SUMMARY_DIRECT_MAX_CHARS=32000 -# LAZIER_SUMMARY_MAP_CHUNK_CHARS=14000 -# LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS=1400 +# LAZIER_SUMMARY_DIRECT_MAX_CHARS=48000 +# LAZIER_SUMMARY_MAP_CHUNK_CHARS=16000 +# LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS=800 # --------------------------------------------------------------------------- -# Spike STT / diarizacao (opcional, sem segundo fornecedor ligado por defeito) +# Performance (menos chamadas = mais rapido e barato) # --------------------------------------------------------------------------- +# Revisao ortografica extra apos PT-BR (1+ chamada chat). Default: false. +# LAZIER_ENABLE_PT_POLISH=false + +# Classificador de tipo antes do sumario (1 chamada chat). Default: false. +# LAZIER_DETECT_CONTENT_TYPE=false + +# Paralelismo em chunks de audio e sumario. Default: 3. +# LAZIER_STT_PARALLEL_WORKERS=3 +# LAZIER_SUMMARY_PARALLEL_WORKERS=3 + +# LAZIER_ALWAYS_SUMMARY=false +# LAZIER_DIARIZATION_PROVIDER=none # LAZIER_ALT_STT_ENABLED=false diff --git a/README.md b/README.md @@ -28,18 +28,24 @@ Defaults seguros já vêm configurados; sobrescreva apenas se quiser mudar. 
| Variável | Default | Descrição | |----------|---------|-----------| -| `OPENAI_CHAT_MODEL` | `gpt-5-mini` | Modelo de chat usado para sumário, conversão para PT-BR, detecção de tipo e capítulos. Alternativas: `gpt-5`, `gpt-5-nano`, `gpt-4.1`, `gpt-4o-mini`. | +| `OPENAI_CHAT_MODEL` | `gpt-4o-mini` | Modelo de chat usado para sumário, conversão para PT-BR, detecção de tipo e capítulos. Alternativas: `gpt-5-mini`, `gpt-5-nano`, `gpt-4.1-mini`. | | `OPENAI_TRANSCRIBE_MODEL` | `gpt-4o-mini-transcribe` | Modelo padrão de transcrição (saída em texto). Alternativas: `gpt-4o-transcribe`, `whisper-1`. | | `OPENAI_TRANSCRIBE_TIMESTAMPS_MODEL` | `whisper-1` | Modelo usado quando precisamos de `verbose_json` para gerar capítulos. Atualmente `whisper-1` é o mais confiável para retornar `start`/`end`. | | `OPENAI_ENABLE_SMART_SUMMARY` | `true` | Liga/desliga o sumário estruturado (TL;DR, pontos-chave, decisões, ações, tópicos, citações, perguntas em aberto). Quando `false`, mantém o sumário textual legado. | -| `OPENAI_ENABLE_CHAPTERS` | `true` | Liga/desliga geração de capítulos com timestamps em áudio/vídeo. | -| `OPENAI_REASONING_EFFORT` | `medium` | Esforço de raciocínio para modelos da família `gpt-5`/`o-series`. Valores: `minimal`, `low`, `medium`, `high`. | -| `LAZIER_QUALITY_PRESET` | `equilibrado` | Preset global `economico` / `equilibrado` / `maximo` → defaults de chat, transcrição e `reasoning` (sobrescritos por `OPENAI_*` quando definidos). Defina apenas no `.env`; não há seleção na WebGUI nem campo por pedido na API. | +| `OPENAI_ENABLE_CHAPTERS` | `false` | Capítulos com timestamps (exige STT `verbose_json`, mais lento). | +| `OPENAI_REASONING_EFFORT` | `minimal` | Esforço de raciocínio para modelos da família `gpt-5`/`o-series`. Valores: `minimal`, `low`, `medium`, `high`. | +| `LAZIER_QUALITY_PRESET` | `economico` | Preset global `economico` / `equilibrado` / `maximo` → defaults de chat, transcrição e `reasoning`. 
| | `LAZIER_SUMMARY_HIERARCHICAL` | `true` | Acima de `LAZIER_SUMMARY_DIRECT_MAX_CHARS`, o sumário inteligente usa map-reduce com chunks e overlap em vez de um único passe sobre o texto completo. | -| `LAZIER_SUMMARY_DIRECT_MAX_CHARS` | `32000` | Limiar (caracteres) para ativar sumarização hierárquica. | -| `LAZIER_SUMMARY_MAP_CHUNK_CHARS` | `14000` | Tamanho alvo de cada chunk no map. | -| `LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS` | `1400` | Sobreposição entre chunks consecutivos. | -| `LAZIER_ALT_STT_ENABLED` | `false` | Reserva para spike de segundo STT ou diarização; incluída para documentar critérios go/no-go sem ligar fornecedor alternativo por defeito. | +| `LAZIER_SUMMARY_DIRECT_MAX_CHARS` | `48000` | Limiar (caracteres) para ativar sumarização hierárquica. | +| `LAZIER_SUMMARY_MAP_CHUNK_CHARS` | `16000` | Tamanho alvo de cada chunk no map. | +| `LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS` | `800` | Sobreposição entre chunks consecutivos. | +| `LAZIER_ENABLE_PT_POLISH` | `false` | Revisão ortográfica extra após PT-BR (mais chamadas chat). | +| `LAZIER_DETECT_CONTENT_TYPE` | `false` | Classificador de tipo antes do sumário (1 chamada chat). | +| `LAZIER_STT_PARALLEL_WORKERS` | `3` | Transcrição paralela de chunks de áudio. | +| `LAZIER_SUMMARY_PARALLEL_WORKERS` | `3` | Sumário inteligente paralelo por chunk. | +| `LAZIER_ALT_STT_ENABLED` | `false` | Reserva para spike de segundo STT ou diarização. | +| `LAZIER_ALWAYS_SUMMARY` | `false` | Se `true`, força geração de sumário mesmo quando o modo pedido é só `transcribe` (override de instalação). | +| `LAZIER_DIARIZATION_PROVIDER` | `none` | Diarização opcional de falantes (`none`, ou provedores futuros). | Opcional: `YOUTUBE_PO_TOKEN` para melhor suporte a alguns vídeos do YouTube ([guia](https://github.com/yt-dlp/yt-dlp/wiki/PO-Token-Guide)). @@ -93,16 +99,20 @@ Acesse **http://localhost:19283** (ou use `--port` para outra porta). 
### CLI ```bash -# Transcrição (gera o texto em PT-BR) +# Só transcrição (texto completo em PT-BR — default, mais económico) lazier transcribe audio.mp3 lazier transcribe video.mp4 lazier transcribe "https://www.youtube.com/watch?v=VIDEO_ID" -# Sumário (texto + sumário inteligente + capítulos quando aplicável) +# Só sumário (STT/extração internos; exporta apenas o sumário) lazier summarize document.pdf lazier summarize "https://example.com/artigo" --format md lazier summarize aula.mp3 --gpt-model gpt-5 --reasoning high -# Desligar features novas explicitamente + +# Transcrição + sumário (dois artefactos; maior consumo de tokens) +lazier process aula.mp3 --format md + +# Desligar features opcionais lazier summarize aula.mp3 --no-smart --no-chapters # Outras opções @@ -122,6 +132,28 @@ Flags principais: O pacote custo/qualidade (`economico` / `equilibrado` / `maximo`) vem só de `LAZIER_QUALITY_PRESET` no `.env` (`lazier config` mostra o valor atual). +### Economia de tokens + +| Modo | O que gera | Notas | +|------|------------|-------| +| `transcribe` | Transcrição PT-BR | Não chama sumário inteligente nem detecção de tipo de conteúdo. | +| `summarize` | Sumário PT-BR | Áudio/vídeo ainda passam por STT; transcrição completa não é exportada. | +| `process` | Transcrição + sumário | Dois ficheiros na WebGUI; custo total de chat. | + +Na WebGUI escolha **Transcrição**, **Sumário** ou **Ambos**. Na API, omitir `mode` equivale a `transcribe`. Use `"mode": "process"` ou `transcribe: true, summarize: true` para os dois. + +### Velocidade e custo (defaults) + +O perfil **económico** shipado prioriza latência e preço: + +- **`gpt-4o-mini`** para chat (conversão PT-BR e sumário). +- **Capítulos desligados** — evita STT com `whisper-1` + geração de capítulos. +- **Sem polish nem detecção de tipo** — poupa 1–3 chamadas chat por job. +- **Áudio normalizado** mono 16 kHz antes do STT (ficheiros menores). +- **Chunks STT/sumário em paralelo** (até 3 workers). 
+ +Para máxima qualidade: `LAZIER_QUALITY_PRESET=maximo`, `OPENAI_ENABLE_CHAPTERS=true`, `LAZIER_ENABLE_PT_POLISH=true`. + ### WebGUI Acesse http://localhost:19283 após iniciar com `lazier web` ou Docker. Na primeira vez, faça login com o usuário e senha definidos em `ADMIN_USER` e `ADMIN_PASSWORD` no `.env` (se configurados). @@ -134,7 +166,17 @@ Acesse http://localhost:19283 após iniciar com `lazier web` ou Docker. Na prime { "url": "https://example.com/aula.mp3", "format": "md", - "mode": "summarize", + "mode": "summarize" +} +``` + +Modos: `transcribe` (default), `summarize`, `process` (ambos). Overrides opcionais: + +```json +{ + "url": "https://example.com/aula.mp3", + "format": "md", + "mode": "process", "chat_model": "gpt-5", "transcribe_model": "gpt-4o-transcribe", "smart": true, diff --git a/lazier/api/routes.py b/lazier/api/routes.py @@ -4,6 +4,7 @@ Rotas da API FastAPI. import logging import os +import shutil import tempfile import uuid import zipfile @@ -11,7 +12,7 @@ from datetime import datetime from pathlib import Path from typing import List, Optional, Tuple -from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, UploadFile +from fastapi import APIRouter, BackgroundTasks, File, Form, HTTPException, Query, UploadFile from fastapi.responses import FileResponse from pydantic import BaseModel @@ -31,6 +32,8 @@ router = APIRouter() UPLOAD_DIR = Path(os.getenv("LAZIER_UPLOAD_DIR", "/app/uploads")) UPLOAD_DIR.mkdir(parents=True, exist_ok=True) +VALID_EXPORT_FORMATS = frozenset({"docx", "txt", "md", "json", "pdf"}) + class ProcessRequest(BaseModel): """Request para processar URL.""" @@ -55,12 +58,18 @@ def _resolve_mode( if mode not in {"transcribe", "summarize", "process"}: raise HTTPException( status_code=400, - detail="Modo invalido. Use 'process' (padrao), 'transcribe' ou 'summarize'.", + detail="Modo invalido. 
Use 'transcribe' (padrao), 'summarize' ou 'process'.", ) return mode if transcribe is None and summarize is None: - return "process" + return "transcribe" + + if transcribe is False and summarize is False: + raise HTTPException( + status_code=400, + detail="Escolha transcrição, sumário ou ambos (transcribe/summarize/process).", + ) if transcribe and summarize: return "process" @@ -69,18 +78,125 @@ def _resolve_mode( if summarize and not transcribe: return "summarize" - return "process" + return "transcribe" + + +def _normalize_export_format(format_type: Optional[str]) -> str: + fmt = (format_type or "docx").strip().lower() + if fmt == "markdown": + fmt = "md" + if fmt not in VALID_EXPORT_FORMATS: + raise HTTPException( + status_code=400, + detail=f"Formato invalido: {format_type}. Use: docx, txt, md, json, pdf.", + ) + return fmt + + +def _job_stored_format(job: dict) -> str: + return _normalize_export_format(job.get("format") or "docx") -def _download_filename(job: dict, artifact_kind: str) -> str: +def _download_filename(job: dict, artifact_kind: str, format_type: Optional[str] = None) -> str: return build_export_filename( job.get("metadata", {}), - job.get("format", "docx"), + _normalize_export_format(format_type or job.get("format")), source_name=job.get("source_name"), artifact_kind=artifact_kind, ) +def _export_artifact_to_path(job: dict, artifact_kind: str, output_path: Path, format_type: str) -> None: + metadata = job.get("metadata", {}) or {} + fmt = _normalize_export_format(format_type) + if artifact_kind == "transcription": + export( + transcription=job["transcription"], + summary=None, + metadata=metadata, + output_path=str(output_path), + format_type=fmt, + ) + elif artifact_kind == "summary": + export( + transcription="", + summary=job["summary"], + metadata=metadata, + output_path=str(output_path), + format_type=fmt, + ) + else: + raise ValueError(f"Artefato invalido: {artifact_kind}") + + +def _resolve_download_path( + job: dict, + artifact_kind: 
str, + format_type: Optional[str] = None, +) -> Tuple[Optional[str], bool]: + """Resolve caminho de download. Retorna (path, efemero).""" + + if artifact_kind == "result": + mode = job.get("mode") + artifact_kind = "transcription" if mode == "transcribe" else "summary" + + fmt = _normalize_export_format(format_type or job.get("format")) + job_fmt = _job_stored_format(job) + + if artifact_kind == "transcription" and not job.get("transcription"): + return None, False + if artifact_kind == "summary" and not job.get("summary"): + return None, False + + path_key = {"transcription": "transcription_path", "summary": "summary_path"}.get(artifact_kind) + if not path_key: + return None, False + + if fmt == job_fmt: + existing_path = job.get(path_key) + if existing_path and Path(existing_path).exists(): + return existing_path, False + + output_path = build_job_artifact_path( + job_id=job["id"], + source_name=job.get("source_name"), + format_type=fmt, + artifact_kind=artifact_kind, + created_at=job.get("created_at"), + metadata=job.get("metadata", {}), + ) + _export_artifact_to_path(job, artifact_kind, output_path, fmt) + get_job_store().update_job(job["id"], **{path_key: str(output_path)}) + return str(output_path), False + + tmp_dir = Path(tempfile.mkdtemp(prefix="lazier-dl-")) + filename = build_export_filename( + job.get("metadata", {}), + fmt, + source_name=job.get("source_name"), + artifact_kind=artifact_kind, + ) + output_path = tmp_dir / filename + _export_artifact_to_path(job, artifact_kind, output_path, fmt) + return str(output_path), True + + +def _cleanup_download_path(path: str, ephemeral: bool) -> None: + if not ephemeral or not path: + return + file_path = Path(path) + parent = file_path.parent + if parent.name.startswith("lazier-dl-"): + shutil.rmtree(parent, ignore_errors=True) + elif file_path.exists(): + file_path.unlink(missing_ok=True) + + +def _ensure_download_file(job: dict, artifact_kind: str, format_type: Optional[str] = None) -> Optional[str]: + path, 
_ephemeral = _resolve_download_path(job, artifact_kind, format_type) + return path + + def _job_title(job: dict) -> str: metadata = job.get("metadata", {}) if metadata.get("title"): @@ -228,67 +344,14 @@ def _process_job(job_id: str) -> None: broadcast_progress(job_id, 0, "failed", str(exc)) -def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]: - existing_path_key = { - "transcription": "transcription_path", - "summary": "summary_path", - "result": "result_path", - }[artifact_kind] - existing_path = job.get(existing_path_key) - if existing_path and Path(existing_path).exists(): - return existing_path - - if artifact_kind == "transcription" and job.get("transcription"): - output_path = build_job_artifact_path( - job_id=job["id"], - source_name=job.get("source_name"), - format_type=job.get("format", "docx"), - artifact_kind="transcription", - created_at=job.get("created_at"), - metadata=job.get("metadata", {}), - ) - export( - transcription=job["transcription"], - summary=None, - metadata=job.get("metadata", {}), - output_path=str(output_path), - format_type=job.get("format", "docx"), - ) - get_job_store().update_job(job["id"], transcription_path=str(output_path)) - return str(output_path) - - if artifact_kind == "summary" and job.get("summary"): - output_path = build_job_artifact_path( - job_id=job["id"], - source_name=job.get("source_name"), - format_type=job.get("format", "docx"), - artifact_kind="summary", - created_at=job.get("created_at"), - metadata=job.get("metadata", {}), - ) - export( - transcription="", - summary=job["summary"], - metadata=job.get("metadata", {}), - output_path=str(output_path), - format_type=job.get("format", "docx"), - ) - get_job_store().update_job(job["id"], summary_path=str(output_path)) - return str(output_path) - - if artifact_kind == "result": - mode = job.get("mode") - preferred_key = "transcription" if mode == "transcribe" else "summary" - return _ensure_download_file(job, preferred_key) - - return None - - 
-def _require_distinct_transcription_and_summary_paths(job: dict) -> Tuple[str, str]: - """Resolve paths for bundle ZIP or raise HTTPException.""" +def _require_distinct_transcription_and_summary_paths( + job: dict, + format_type: Optional[str] = None, +) -> Tuple[str, str, bool, bool]: + """Resolve paths for bundle ZIP. Retorna (tx, sm, tx_ephemeral, sm_ephemeral).""" - tx_path = _ensure_download_file(job, "transcription") - sm_path = _ensure_download_file(job, "summary") + tx_path, tx_tmp = _resolve_download_path(job, "transcription", format_type) + sm_path, sm_tmp = _resolve_download_path(job, "summary", format_type) if not tx_path or not sm_path: raise HTTPException( status_code=404, @@ -299,7 +362,7 @@ def _require_distinct_transcription_and_summary_paths(job: dict) -> Tuple[str, s status_code=400, detail="Transcricao e sumario referem-se ao mesmo ficheiro; pacote ZIP indisponivel.", ) - return (tx_path, sm_path) + return tx_path, sm_path, tx_tmp, sm_tmp def _unlink_quiet(path: str) -> None: @@ -453,63 +516,85 @@ async def get_job_details(job_id: str): @router.get("/jobs/{job_id}/transcription") -async def download_transcription(job_id: str): - """Download da transcricao.""" +async def download_transcription( + job_id: str, + background_tasks: BackgroundTasks, + format: Optional[str] = Query(None, description="docx, txt, md, json ou pdf"), +): + """Download da transcricao no formato pedido.""" job = get_job_store().get_job(job_id) if not job: raise HTTPException(status_code=404, detail="Job nao encontrado") - download_path = _ensure_download_file(job, "transcription") + download_path, ephemeral = _resolve_download_path(job, "transcription", format) if not download_path: raise HTTPException(status_code=404, detail="Transcricao nao disponivel") - filename = _download_filename(job, "transcription") + filename = _download_filename(job, "transcription", format) + if ephemeral: + background_tasks.add_task(_cleanup_download_path, download_path, True) return 
FileResponse(download_path, media_type="application/octet-stream", filename=filename) @router.get("/jobs/{job_id}/summary") -async def download_summary(job_id: str): - """Download do sumario.""" +async def download_summary( + job_id: str, + background_tasks: BackgroundTasks, + format: Optional[str] = Query(None, description="docx, txt, md, json ou pdf"), +): + """Download do sumario no formato pedido.""" job = get_job_store().get_job(job_id) if not job: raise HTTPException(status_code=404, detail="Job nao encontrado") - download_path = _ensure_download_file(job, "summary") + download_path, ephemeral = _resolve_download_path(job, "summary", format) if not download_path: raise HTTPException(status_code=404, detail="Sumario nao disponivel") - filename = _download_filename(job, "summary") + filename = _download_filename(job, "summary", format) + if ephemeral: + background_tasks.add_task(_cleanup_download_path, download_path, True) return FileResponse(download_path, media_type="application/octet-stream", filename=filename) @router.get("/jobs/{job_id}/download") -async def download_result(job_id: str): +async def download_result( + job_id: str, + background_tasks: BackgroundTasks, + format: Optional[str] = Query(None, description="docx, txt, md, json ou pdf"), +): """Download do artefato principal do job.""" job = get_job_store().get_job(job_id) if not job: raise HTTPException(status_code=404, detail="Job nao encontrado") - download_path = _ensure_download_file(job, "result") + download_path, ephemeral = _resolve_download_path(job, "result", format) if not download_path: raise HTTPException(status_code=404, detail="Arquivo de resultado nao encontrado") artifact_kind = "transcription" if job.get("mode") == "transcribe" else "summary" - filename = _download_filename(job, artifact_kind) + filename = _download_filename(job, artifact_kind, format) + if ephemeral: + background_tasks.add_task(_cleanup_download_path, download_path, True) return FileResponse(download_path, 
media_type="application/octet-stream", filename=filename) @router.get("/jobs/{job_id}/download-bundle") -async def download_bundle(job_id: str, background_tasks: BackgroundTasks): - """Download ZIP com transcricao e sumario em ficheiros separados.""" +async def download_bundle( + job_id: str, + background_tasks: BackgroundTasks, + format: Optional[str] = Query(None, description="docx, txt, md, json ou pdf"), +): + """Download ZIP com transcricao e sumario no formato pedido.""" job = get_job_store().get_job(job_id) if not job: raise HTTPException(status_code=404, detail="Job nao encontrado") - tx_path, sm_path = _require_distinct_transcription_and_summary_paths(job) + tx_path, sm_path, tx_tmp, sm_tmp = _require_distinct_transcription_and_summary_paths(job, format) fd, tmp_name = tempfile.mkstemp(suffix=".zip") os.close(fd) @@ -529,6 +614,10 @@ async def download_bundle(job_id: str, background_tasks: BackgroundTasks): artifact_kind="result", ) background_tasks.add_task(_unlink_quiet, str(tmp_path)) + if tx_tmp: + background_tasks.add_task(_cleanup_download_path, tx_path, True) + if sm_tmp: + background_tasks.add_task(_cleanup_download_path, sm_path, True) return FileResponse(str(tmp_path), media_type="application/zip", filename=zip_filename) diff --git a/lazier/audio_processor.py b/lazier/audio_processor.py @@ -202,6 +202,42 @@ def split_audio(audio_path: str, chunk_size_mb: int = 24) -> list[str]: raise Exception(f"Erro ao dividir áudio: {str(e)}") +def normalize_audio_for_stt(input_path: str) -> str: + """ + Normaliza audio para STT: mono, 16 kHz, MP3 64 kbps. + Ficheiros menores = upload e transcricao mais rapidos. 
+ """ + if not check_ffmpeg(): + return input_path + + path_obj = Path(input_path) + output_path = path_obj.parent / f"{path_obj.stem}_stt.mp3" + + cmd = [ + "ffmpeg", + "-i", + input_path, + "-vn", + "-ac", + "1", + "-ar", + "16000", + "-acodec", + "libmp3lame", + "-b:a", + "64k", + "-y", + str(output_path), + ] + try: + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) + if output_path.exists(): + return str(output_path) + except subprocess.CalledProcessError: + pass + return input_path + + def prepare_audio_file(input_path: str, is_video: bool = False) -> str: """ Prepara arquivo de áudio para transcrição @@ -215,10 +251,11 @@ def prepare_audio_file(input_path: str, is_video: bool = False) -> str: Caminho do arquivo de áudio preparado """ if is_video: - return extract_audio_from_video(input_path) + prepared = extract_audio_from_video(input_path) else: - # Tenta converter para mp3 se necessário (formato mais compatível) audio_ext = Path(input_path).suffix.lower() - if audio_ext not in ['.mp3', '.wav']: - return convert_audio_format(input_path, 'mp3') - return input_path + if audio_ext not in [".mp3", ".wav", ".m4a", ".ogg"]: + prepared = convert_audio_format(input_path, "mp3") + else: + prepared = input_path + return normalize_audio_for_stt(prepared) diff --git a/lazier/cli.py b/lazier/cli.py @@ -129,9 +129,10 @@ def cli(ctx, input_path): if input_path: console.print( Panel.fit( - "Fluxo unificado: transcrição em PT-BR + sumário (ajuste com LAZIER_ALWAYS_SUMMARY).\n\n" - "`lazier process <input>` — fluxo unificado explícito.\n" - "`lazier transcribe` / `lazier summarize` — modos legados (ver LAZIER_ALWAYS_SUMMARY).", + "Escolha o que gerar para economizar tokens:\n\n" + "`lazier transcribe <input>` — só transcrição em PT-BR\n" + "`lazier summarize <input>` — só sumário\n" + "`lazier process <input>` — transcrição + sumário", title="Modo de Uso", ) ) @@ -186,7 +187,7 @@ def transcribe( chapters_flag: Optional[bool], reasoning: 
Optional[str], ): - """Converte/transcreve para PT-BR; com LAZIER_ALWAYS_SUMMARY=true também gera sumário.""" + """Gera só a transcrição em PT-BR (sem sumário, salvo LAZIER_ALWAYS_SUMMARY=true).""" _run_mode( input_path=input_path, mode="transcribe", @@ -215,7 +216,7 @@ def summarize( chapters_flag: Optional[bool], reasoning: Optional[str], ): - """Foco em sumário; com LAZIER_ALWAYS_SUMMARY=true também exporta transcrição completa.""" + """Gera só o sumário em PT-BR (STT/extração internos; transcrição não é exportada).""" _run_mode( input_path=input_path, mode="summarize", @@ -244,7 +245,7 @@ def process( chapters_flag: Optional[bool], reasoning: Optional[str], ): - """Executa o pipeline unificado (transcrição + sumário em PT-BR).""" + """Gera transcrição e sumário em PT-BR (maior consumo de tokens).""" _run_mode( input_path=input_path, mode="process", @@ -272,8 +273,12 @@ def config(): f"Smart summary: [cyan]{'on' if cfg.enable_smart_summary else 'off'}[/cyan]\n" f"Capitulos: [cyan]{'on' if cfg.enable_chapters else 'off'}[/cyan]\n" f"Preset qualidade: [cyan]{cfg.quality_preset}[/cyan]\n" + f"Polish PT-BR extra: [cyan]{'on' if cfg.enable_pt_polish else 'off'}[/cyan]\n" + f"Detectar tipo conteudo: [cyan]{'on' if cfg.detect_content_type else 'off'}[/cyan]\n" + f"STT paralelo (workers): [cyan]{cfg.stt_parallel_workers}[/cyan]\n" + f"Sumario paralelo (workers): [cyan]{cfg.summary_parallel_workers}[/cyan]\n" f"Sumario hierarquico: [cyan]{'on' if cfg.hierarchical_summary else 'off'}[/cyan]\n" - f"Sempre gerar sumario (LAZIER_ALWAYS_SUMMARY): [cyan]{'on' if cfg.always_summary else 'off'}[/cyan]\n" + f"Sempre gerar sumario (LAZIER_ALWAYS_SUMMARY): [cyan]{'on' if cfg.always_summary else 'off'}[/cyan] — override admin\n" f"Diarizacao (LAZIER_DIARIZATION_PROVIDER): [cyan]{cfg.diarization_provider or 'none'}[/cyan]\n" f"STT alternativo (flag): [cyan]{'on' if cfg.alt_stt_enabled else 'off'}[/cyan]", title="Lazier - Config Atual", diff --git a/lazier/core/config.py 
b/lazier/core/config.py @@ -22,15 +22,13 @@ except ImportError: # pragma: no cover load_dotenv() -# Defaults shipados (Maio 2026): -# - chat: gpt-5-mini -> equilibrio qualidade/custo -# - transcribe: gpt-4o-mini-transcribe -> sucessor de whisper-1, mais preciso -# - timestamps: whisper-1 -> ainda e a forma confiavel de obter `verbose_json` -# com segments para construir capitulos. -DEFAULT_CHAT_MODEL = "gpt-5-mini" +# Defaults shipados (Maio 2026) — perfil economico: rapido e barato. +# Sobrescreva via LAZIER_QUALITY_PRESET=maximo ou OPENAI_* no .env. +DEFAULT_CHAT_MODEL = "gpt-4o-mini" DEFAULT_TRANSCRIBE_MODEL = "gpt-4o-mini-transcribe" DEFAULT_TRANSCRIBE_TIMESTAMPS_MODEL = "whisper-1" -DEFAULT_REASONING_EFFORT = "medium" +DEFAULT_REASONING_EFFORT = "minimal" +DEFAULT_QUALITY_PRESET = "economico" VALID_REASONING_EFFORTS = {"minimal", "low", "medium", "high"} @@ -39,7 +37,7 @@ VALID_QUALITY_PRESETS = frozenset({"economico", "equilibrado", "maximo"}) # Presets: custo vs qualidade (sobrescritos por OPENAI_* quando definidos). 
QUALITY_PRESETS: Dict[str, Dict[str, str]] = { "economico": { - "chat_model": "gpt-5-nano", + "chat_model": "gpt-4o-mini", "transcribe_model": "gpt-4o-mini-transcribe", "reasoning_effort": "minimal", }, @@ -57,8 +55,8 @@ QUALITY_PRESETS: Dict[str, Dict[str, str]] = { def resolve_quality_preset(name: Optional[str]) -> str: - key = (name or "equilibrado").strip().lower() - return key if key in VALID_QUALITY_PRESETS else "equilibrado" + key = (name or DEFAULT_QUALITY_PRESET).strip().lower() + return key if key in VALID_QUALITY_PRESETS else DEFAULT_QUALITY_PRESET def get_preset_model_defaults(preset: Optional[str]) -> Dict[str, str]: @@ -98,17 +96,21 @@ class ModelConfig: chat_model: str = DEFAULT_CHAT_MODEL transcribe_model: str = DEFAULT_TRANSCRIBE_MODEL transcribe_timestamps_model: str = DEFAULT_TRANSCRIBE_TIMESTAMPS_MODEL - enable_chapters: bool = True + enable_chapters: bool = False enable_smart_summary: bool = True reasoning_effort: str = DEFAULT_REASONING_EFFORT - quality_preset: str = "equilibrado" + quality_preset: str = DEFAULT_QUALITY_PRESET hierarchical_summary: bool = True - summary_direct_max_chars: int = 32_000 - summary_map_chunk_chars: int = 14_000 - summary_chunk_overlap_chars: int = 1_400 + summary_direct_max_chars: int = 48_000 + summary_map_chunk_chars: int = 16_000 + summary_chunk_overlap_chars: int = 800 alt_stt_enabled: bool = False - always_summary: bool = True + always_summary: bool = False diarization_provider: str = "none" + enable_pt_polish: bool = False + detect_content_type: bool = False + stt_parallel_workers: int = 3 + summary_parallel_workers: int = 3 def supports_reasoning(self, model: Optional[str] = None) -> bool: """Indica se devemos enviar `reasoning_effort` para um modelo.""" @@ -133,7 +135,7 @@ def get_model_config(refresh: bool = False) -> ModelConfig: if _cached_config is not None and not refresh: return _cached_config - preset_key = resolve_quality_preset(_env_str("LAZIER_QUALITY_PRESET", "equilibrado")) + preset_key = 
resolve_quality_preset(_env_str("LAZIER_QUALITY_PRESET", DEFAULT_QUALITY_PRESET)) pd = QUALITY_PRESETS[preset_key] reasoning = _env_str("OPENAI_REASONING_EFFORT", pd["reasoning_effort"]).lower() @@ -146,17 +148,21 @@ def get_model_config(refresh: bool = False) -> ModelConfig: transcribe_timestamps_model=_env_str( "OPENAI_TRANSCRIBE_TIMESTAMPS_MODEL", DEFAULT_TRANSCRIBE_TIMESTAMPS_MODEL ), - enable_chapters=_env_bool("OPENAI_ENABLE_CHAPTERS", True), + enable_chapters=_env_bool("OPENAI_ENABLE_CHAPTERS", False), enable_smart_summary=_env_bool("OPENAI_ENABLE_SMART_SUMMARY", True), reasoning_effort=reasoning, quality_preset=preset_key, hierarchical_summary=_env_bool("LAZIER_SUMMARY_HIERARCHICAL", True), - summary_direct_max_chars=_env_int("LAZIER_SUMMARY_DIRECT_MAX_CHARS", 32_000, min_value=2_000), - summary_map_chunk_chars=_env_int("LAZIER_SUMMARY_MAP_CHUNK_CHARS", 14_000, min_value=2_000), - summary_chunk_overlap_chars=_env_int("LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS", 1_400, min_value=0), + summary_direct_max_chars=_env_int("LAZIER_SUMMARY_DIRECT_MAX_CHARS", 48_000, min_value=2_000), + summary_map_chunk_chars=_env_int("LAZIER_SUMMARY_MAP_CHUNK_CHARS", 16_000, min_value=2_000), + summary_chunk_overlap_chars=_env_int("LAZIER_SUMMARY_CHUNK_OVERLAP_CHARS", 800, min_value=0), alt_stt_enabled=_env_bool("LAZIER_ALT_STT_ENABLED", False), - always_summary=_env_bool("LAZIER_ALWAYS_SUMMARY", True), + always_summary=_env_bool("LAZIER_ALWAYS_SUMMARY", False), diarization_provider=_env_str("LAZIER_DIARIZATION_PROVIDER", "none").lower(), + enable_pt_polish=_env_bool("LAZIER_ENABLE_PT_POLISH", False), + detect_content_type=_env_bool("LAZIER_DETECT_CONTENT_TYPE", False), + stt_parallel_workers=_env_int("LAZIER_STT_PARALLEL_WORKERS", 3, min_value=1), + summary_parallel_workers=_env_int("LAZIER_SUMMARY_PARALLEL_WORKERS", 3, min_value=1), ) return _cached_config diff --git a/lazier/core/processing.py b/lazier/core/processing.py @@ -8,7 +8,7 @@ Inclui as etapas de: - Diarizacao opcional 
(LAZIER_DIARIZATION_PROVIDER) antes da conversao para PT-BR - Conversao para PT-BR + revisao ortografica leve (polish_pt_br_text) - Deteccao de tipo de conteudo -- Sumario (legado ou estruturado; LAZIER_ALWAYS_SUMMARY=false recupera fluxos antigos) +- Sumario (legado ou estruturado; LAZIER_ALWAYS_SUMMARY forca sumario mesmo em transcribe) - Capitulos com timestamps - Export final (dois artefactos quando transcrição + sumário e saida por job) """ @@ -98,7 +98,25 @@ def _ensure_mode(mode: str) -> str: return mode -def _maybe_polish_pt_br(text: Optional[str], kind: str, runtime: Dict[str, Any]) -> Optional[str]: +def _wants_summary(mode: str, always_summary: bool) -> bool: + return mode in {"summarize", "process"} or always_summary + + +def _wants_transcription_output(mode: str) -> bool: + return mode in {"transcribe", "process"} + + +def _wants_summary_output(mode: str) -> bool: + return mode in {"summarize", "process"} + + +def _maybe_polish_pt_br( + text: Optional[str], + kind: str, + runtime: Dict[str, Any], +) -> Optional[str]: + if not runtime.get("pt_polish_enabled", False): + return text if not text or not str(text).strip(): return text try: @@ -136,6 +154,8 @@ def _resolve_runtime( "reasoning_effort": config.reasoning_effort, "quality_preset": config.quality_preset, "always_summary": config.always_summary, + "pt_polish_enabled": config.enable_pt_polish, + "detect_content_type": config.detect_content_type, } @@ -267,6 +287,9 @@ def _detect_and_attach_content_type( return None if metadata.get("content_type"): return metadata["content_type"] + if not runtime.get("detect_content_type", False): + metadata.setdefault("content_type", "other") + return metadata["content_type"] detection = detect_content_type(text, metadata=metadata, model=runtime["chat_model"]) metadata["content_type"] = detection.get("content_type", "other") metadata["content_type_confidence"] = detection.get("confidence", 0.0) @@ -597,10 +620,16 @@ def process_source( metadata = {**metadata, 
"webpage_url": source} # ---- Etapas pos-transcricao comuns a todos os tipos ---- - if portuguese_text: + always_sum = runtime["always_summary"] + wants_summary = _wants_summary(mode, always_sum) + + if not wants_summary and summary: + summary = None + + if portuguese_text and _wants_transcription_output(mode): portuguese_text = _maybe_polish_pt_br(portuguese_text, "transcription", runtime) or "" - if portuguese_text: + if portuguese_text and wants_summary: _detect_and_attach_content_type(portuguese_text, metadata, runtime) chapters = _build_chapters_if_possible( @@ -610,11 +639,8 @@ def process_source( progress_callback=progress_callback, ) - always_sum = runtime["always_summary"] text_for_summary = (portuguese_text or "").strip() - should_build_summary = bool(text_for_summary) and ( - always_sum or mode in {"summarize", "process"} - ) + should_build_summary = bool(text_for_summary) and wants_summary if should_build_summary and not summary: summary = _build_summary( @@ -636,7 +662,7 @@ def process_source( "open_questions": [], } - if summary: + if summary and _wants_summary_output(mode): summary = _maybe_polish_pt_br(summary, "summary", runtime) # ---- Atualiza caches "ricos" para fontes de midia ---- @@ -683,14 +709,9 @@ def process_source( resolved_source_name = source_name or metadata.get("title") or source - has_tx = bool(portuguese_text and str(portuguese_text).strip()) - has_sm = bool(summary and str(summary).strip()) - dual_artifacts = ( - not output_path - and has_tx - and has_sm - and (runtime["always_summary"] or mode == "process") - ) + has_tx = bool(portuguese_text and str(portuguese_text).strip()) and _wants_transcription_output(mode) + has_sm = bool(summary and str(summary).strip()) and _wants_summary_output(mode) + dual_artifacts = not output_path and has_tx and has_sm and mode == "process" _notify(progress_callback, 92, "processing", f"Gerando arquivo {output_format.upper()}...") @@ -799,16 +820,18 @@ def process_source( raise Exception("Nenhum 
conteudo gerado para exportar.") output_dir = str(Path(result_path or ".").parent) + result_transcription = portuguese_text if _wants_transcription_output(mode) else None + result_summary = summary if _wants_summary_output(mode) else None result = { "mode": mode, "input_type": input_type, "source_name": resolved_source_name, "metadata": metadata, - "transcription": portuguese_text, - "summary": summary, - "smart_summary": metadata.get("smart_summary"), - "chapters": chapters, - "content_type": metadata.get("content_type"), + "transcription": result_transcription, + "summary": result_summary, + "smart_summary": metadata.get("smart_summary") if _wants_summary_output(mode) else None, + "chapters": chapters if _wants_transcription_output(mode) or _wants_summary_output(mode) else [], + "content_type": metadata.get("content_type") if wants_summary else None, "result_path": result_path, "transcription_path": transcription_path, "summary_path": summary_path, diff --git a/lazier/summarizer.py b/lazier/summarizer.py @@ -16,7 +16,8 @@ from __future__ import annotations import json import os -from typing import Any, Dict, List, Optional +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Any, Dict, List, Optional, Tuple try: from openai import OpenAI @@ -535,21 +536,14 @@ def summarize_smart( language, reasoning_effort=reasoning_effort, ).model_dump() - parciais_h: List[SmartSummary] = [] - for index, chunk in enumerate(chunks): - print(f"Sumario inteligente parte {index + 1}/{len(chunks)}...") - try: - parciais_h.append( - _summarize_smart_chunk( - chunk, - chosen_model, - content_type, - language, - reasoning_effort=reasoning_effort, - ) - ) - except Exception as exc: - print(f"Erro ao sumarizar parte {index + 1}: {exc}") + parciais_h = _collect_smart_partials( + chunks, + chosen_model, + content_type, + language, + reasoning_effort, + label="Sumario inteligente hierarquico", + ) if not parciais_h: return SmartSummary(tldr="Falha ao gerar 
sumario: todas as partes falharam.").model_dump() if len(parciais_h) == 1: @@ -573,21 +567,14 @@ def summarize_smart( ).model_dump() print(f"Texto longo detectado ({len(text)} chars) - rodando sumario inteligente em {len(chunks)} partes.") - parciais: List[SmartSummary] = [] - for index, chunk in enumerate(chunks): - print(f"Sumario inteligente parte {index + 1}/{len(chunks)}...") - try: - parciais.append( - _summarize_smart_chunk( - chunk, - chosen_model, - content_type, - language, - reasoning_effort=reasoning_effort, - ) - ) - except Exception as exc: - print(f"Erro ao sumarizar parte {index + 1}: {exc}") + parciais = _collect_smart_partials( + chunks, + chosen_model, + content_type, + language, + reasoning_effort, + label="Sumario inteligente", + ) if not parciais: return SmartSummary(tldr="Falha ao gerar sumario: todas as partes falharam.").model_dump() @@ -674,6 +661,52 @@ def _summarize_smart_chunk( return _summarize_smart_fallback_json(client, model, messages, reasoning_effort=reasoning_effort) +def _collect_smart_partials( + chunks: List[str], + chosen_model: str, + content_type: Optional[str], + language: str, + reasoning_effort: Optional[str], + *, + label: str = "Sumario inteligente", +) -> List[SmartSummary]: + """Map de chunks para SmartSummary; paralelo quando LAZIER_SUMMARY_PARALLEL_WORKERS > 1.""" + config = get_model_config() + total = len(chunks) + workers = min(config.summary_parallel_workers, total) if total > 1 else 1 + + def _one(index: int, chunk: str) -> Tuple[int, Optional[SmartSummary]]: + print(f"{label} parte {index + 1}/{total}...") + try: + return index, _summarize_smart_chunk( + chunk, + chosen_model, + content_type, + language, + reasoning_effort=reasoning_effort, + ) + except Exception as exc: + print(f"Erro ao sumarizar parte {index + 1}: {exc}") + return index, None + + if workers <= 1: + ordered: List[SmartSummary] = [] + for index, chunk in enumerate(chunks): + _, partial = _one(index, chunk) + if partial: + 
ordered.append(partial) + return ordered + + slots: List[Optional[SmartSummary]] = [None] * total + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [pool.submit(_one, index, chunk) for index, chunk in enumerate(chunks)] + for future in as_completed(futures): + idx, partial = future.result() + if partial: + slots[idx] = partial + return [item for item in slots if item is not None] + + def _summarize_smart_fallback_json( client: "OpenAI", model: str, diff --git a/lazier/transcriber.py b/lazier/transcriber.py @@ -11,13 +11,15 @@ Suporta dois modos: forma mais confiavel de obter `verbose_json`. Ambos lidam com chunking automatico para arquivos > 24MB (limite da API e ~25MB). +Chunks podem ser transcritos em paralelo (LAZIER_STT_PARALLEL_WORKERS). """ from __future__ import annotations import os +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple try: from openai import OpenAI @@ -32,7 +34,7 @@ except ImportError: # pragma: no cover - ambiente sem python-dotenv load_dotenv() -from .audio_processor import split_audio +from .audio_processor import get_audio_duration, split_audio from .core.config import get_model_config @@ -69,6 +71,64 @@ def _resolve_chunks(audio_path: str) -> List[str]: return split_audio(audio_path, chunk_size_mb=24) +def _transcribe_plain_chunk( + client: "OpenAI", + chunk_path: str, + chosen_model: str, + language: Optional[str], + index: int, + total: int, +) -> Tuple[int, str]: + if total > 1: + print(f"Processando chunk {index + 1}/{total}...") + with open(chunk_path, "rb") as audio_file: + request_kwargs: Dict[str, Any] = { + "model": chosen_model, + "file": audio_file, + "response_format": "text", + } + if language: + request_kwargs["language"] = language + transcript = client.audio.transcriptions.create(**request_kwargs) + + if hasattr(transcript, "text"): + text = transcript.text + elif 
isinstance(transcript, str): + text = transcript + else: + text = str(transcript) + return index, (text or "").strip() + + +def _transcribe_verbose_chunk( + client: "OpenAI", + chunk_path: str, + chosen_model: str, + language: Optional[str], + index: int, + total: int, +) -> Tuple[int, Dict[str, Any]]: + if total > 1: + print(f"Processando chunk {index + 1}/{total} (com timestamps)...") + with open(chunk_path, "rb") as audio_file: + request_kwargs: Dict[str, Any] = { + "model": chosen_model, + "file": audio_file, + "response_format": "verbose_json", + } + if language: + request_kwargs["language"] = language + response = client.audio.transcriptions.create(**request_kwargs) + return index, _normalize_verbose_response(response) + + +def _parallel_workers(total_chunks: int) -> int: + config = get_model_config() + if total_chunks <= 1: + return 1 + return min(config.stt_parallel_workers, total_chunks) + + def transcribe_audio( audio_path: str, language: Optional[str] = None, @@ -85,30 +145,36 @@ def transcribe_audio( try: chunks = _resolve_chunks(audio_path) - transcriptions: List[str] = [] total = len(chunks) - for index, chunk_path in enumerate(chunks): - if total > 1: - print(f"Processando chunk {index + 1}/{total}...") - with open(chunk_path, "rb") as audio_file: - request_kwargs: Dict[str, Any] = { - "model": chosen_model, - "file": audio_file, - "response_format": "text", - } - if language: - request_kwargs["language"] = language - transcript = client.audio.transcriptions.create(**request_kwargs) - - if hasattr(transcript, "text"): - text = transcript.text - elif isinstance(transcript, str): - text = transcript - else: - text = str(transcript) - transcriptions.append((text or "").strip()) - - return " ".join(part for part in transcriptions if part) + workers = _parallel_workers(total) + + if workers <= 1: + transcriptions: List[str] = [] + for index, chunk_path in enumerate(chunks): + _, text = _transcribe_plain_chunk( + client, chunk_path, chosen_model, language, 
index, total + ) + transcriptions.append(text) + return " ".join(part for part in transcriptions if part) + + ordered: List[str] = [""] * total + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [ + pool.submit( + _transcribe_plain_chunk, + client, + chunk_path, + chosen_model, + language, + index, + total, + ) + for index, chunk_path in enumerate(chunks) + ] + for future in as_completed(futures): + idx, text = future.result() + ordered[idx] = text + return " ".join(part for part in ordered if part) except Exception as exc: raise _wrap_openai_error(exc) from exc @@ -118,18 +184,7 @@ def transcribe_audio_with_timestamps( language: Optional[str] = None, model: Optional[str] = None, ) -> Dict[str, Any]: - """Transcreve com `verbose_json` retornando texto e segmentos com timestamps. - - Returns: - { - "text": str, # texto completo concatenado - "segments": [ # lista ordenada com tempos absolutos - {"start": float, "end": float, "text": str}, - ... - ], - "duration": float | None, # duracao total estimada (s) - } - """ + """Transcreve com `verbose_json` retornando texto e segmentos com timestamps.""" if not os.path.exists(audio_path): raise FileNotFoundError(f"Arquivo de audio nao encontrado: {audio_path}") @@ -140,33 +195,58 @@ def transcribe_audio_with_timestamps( try: chunks = _resolve_chunks(audio_path) + total = len(chunks) + workers = _parallel_workers(total) + + offsets: List[float] = [] + acc = 0.0 + for chunk_path in chunks: + offsets.append(acc) + duration = get_audio_duration(chunk_path) + acc += duration if duration > 0 else 0.0 + + if workers <= 1: + payloads = [] + for index, chunk_path in enumerate(chunks): + _, payload = _transcribe_verbose_chunk( + client, chunk_path, chosen_model, language, index, total + ) + payloads.append(payload) + else: + payloads = [None] * total # type: ignore[list-item] + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [ + pool.submit( + _transcribe_verbose_chunk, + client, + chunk_path, + 
chosen_model, + language, + index, + total, + ) + for index, chunk_path in enumerate(chunks) + ] + for future in as_completed(futures): + idx, payload = future.result() + payloads[idx] = payload + all_segments: List[Dict[str, Any]] = [] all_text_parts: List[str] = [] offset_seconds = 0.0 - total = len(chunks) - for index, chunk_path in enumerate(chunks): - if total > 1: - print(f"Processando chunk {index + 1}/{total} (com timestamps)...") - with open(chunk_path, "rb") as audio_file: - request_kwargs: Dict[str, Any] = { - "model": chosen_model, - "file": audio_file, - "response_format": "verbose_json", - } - if language: - request_kwargs["language"] = language - response = client.audio.transcriptions.create(**request_kwargs) - - payload = _normalize_verbose_response(response) + for index, payload in enumerate(payloads): + if not payload: + continue + chunk_offset = offsets[index] if index < len(offsets) else offset_seconds chunk_duration = payload.get("duration") chunk_segments = payload.get("segments") or [] for seg in chunk_segments: all_segments.append( { - "start": float(seg.get("start", 0.0)) + offset_seconds, - "end": float(seg.get("end", 0.0)) + offset_seconds, + "start": float(seg.get("start", 0.0)) + chunk_offset, + "end": float(seg.get("end", 0.0)) + chunk_offset, "text": (seg.get("text") or "").strip(), } ) @@ -175,10 +255,9 @@ def transcribe_audio_with_timestamps( if chunk_text: all_text_parts.append(chunk_text) - if isinstance(chunk_duration, (int, float)): - offset_seconds += float(chunk_duration) + if isinstance(chunk_duration, (int, float)) and float(chunk_duration) > 0: + offset_seconds = max(offset_seconds, chunk_offset + float(chunk_duration)) elif chunk_segments: - # Fallback: usa o final do ultimo segmento como nova base. 
offset_seconds = max(seg["end"] for seg in all_segments) return { @@ -196,7 +275,6 @@ def _normalize_verbose_response(response: Any) -> Dict[str, Any]: return response.model_dump() if isinstance(response, dict): return response - # ultimo recurso: tenta acessar atributos diretamente return { "text": getattr(response, "text", ""), "segments": getattr(response, "segments", []) or [], diff --git a/lazier/web/templates/index.html b/lazier/web/templates/index.html @@ -116,7 +116,7 @@ cursor: pointer; } - .mode-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 8px; } + .mode-grid { display: grid; grid-template-columns: repeat(3, 1fr); gap: 8px; } .mode { border: 1px solid var(--border); border-radius: var(--radius); @@ -195,7 +195,18 @@ .bar { height: 4px; background: var(--border); border-radius: 2px; overflow: hidden; margin-top: 12px; } .bar span { display: block; height: 100%; background: var(--accent); } - .actions { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 12px; } + .actions { display: flex; flex-direction: column; gap: 10px; margin-top: 12px; } + .download-row { display: flex; flex-wrap: wrap; align-items: center; gap: 6px; } + .download-label { font-size: 0.75rem; font-weight: 500; color: var(--muted); min-width: 88px; } + .download-formats { display: flex; flex-wrap: wrap; gap: 4px; } + .action.fmt { + padding: 6px 10px; + min-height: 32px; + font-size: 0.75rem; + font-weight: 600; + letter-spacing: 0.02em; + } + .action-row-inline { display: flex; flex-wrap: wrap; gap: 8px; } .action { font-size: 0.8125rem; padding: 8px 12px; @@ -329,17 +340,26 @@ <section id="page-process" class="page active"> <div class="section-title">Novo processamento</div> - <p class="section-lead">Gera transcrição completa e sumário em PT-BR; escolha no download o que quer guardar.</p> + <p class="section-lead">Escolha o que gerar; transcrição e sumário juntos consomem mais tokens da API.</p> <p class="subtle" style="margin-top:-8px;">Diarização de falantes é 
opcional (LAZIER_DIARIZATION_PROVIDER); sem ela, o texto não traz separação por falante.</p> <div class="stack"> <div class="upload" id="uploadArea"><input type="file" id="fileInput" hidden multiple accept="audio/*,video/*,application/pdf,text/plain,text/markdown,text/html,.md,.htm,.txt"></div> <div class="field"><label for="urlInput">URL (opcional)</label><input type="text" id="urlInput" placeholder="YouTube, página, áudio em linha…"></div> + <div class="field"> + <label>Saída</label> + <div class="mode-grid"> + <div class="mode selected" onclick="selectMode('transcribe',this)"><input type="radio" checked value="transcribe"><strong>Transcrição</strong><span class="meta">Texto completo em PT-BR.</span></div> + <div class="mode" onclick="selectMode('summarize',this)"><input type="radio" value="summarize"><strong>Sumário</strong><span class="meta">Síntese em PT-BR.</span></div> + <div class="mode" onclick="selectMode('process',this)"><input type="radio" value="process"><strong>Ambos</strong><span class="meta">Transcrição + sumário.</span></div> + </div> + </div> <div class="row-actions"> <div class="field"> - <label for="formatSelect">Formato</label> + <label for="formatSelect">Formato principal</label> <select id="formatSelect"><option value="docx">DOCX</option><option value="txt">TXT</option><option value="md">Markdown</option><option value="json">JSON</option><option value="pdf">PDF</option></select> </div> + <p class="subtle" style="margin:0; flex:1 1 100%;">Após concluir, pode descarregar em qualquer formato (DOCX a PDF).</p> <button class="btn" id="processBtn" onclick="processFiles()">Processar</button> </div> </div> @@ -383,9 +403,16 @@ <script> let selectedFiles = []; - let processingMode = 'process'; + let processingMode = 'transcribe'; let allJobs = []; let currentFilter = 'all'; + const EXPORT_FORMATS = [ + { id: 'docx', label: 'DOCX' }, + { id: 'txt', label: 'TXT' }, + { id: 'md', label: 'MD' }, + { id: 'json', label: 'JSON' }, + { id: 'pdf', label: 'PDF' 
}, + ]; document.addEventListener('DOMContentLoaded', () => { document.getElementById('currentYear').textContent = new Date().getFullYear(); @@ -414,6 +441,12 @@ } function removeFile(index) { selectedFiles.splice(index, 1); document.getElementById('fileInput').value = ''; renderUpload(); } + function selectMode(mode, element) { + processingMode = mode; + document.querySelectorAll('.mode').forEach((node) => node.classList.remove('selected')); + element.classList.add('selected'); + } + function showPage(page) { document.querySelectorAll('.page').forEach((node) => node.classList.remove('active')); document.querySelectorAll('.nav a').forEach((node) => node.classList.remove('active')); @@ -468,29 +501,34 @@ return `<div class="job-head"><div><div class="job-title">${escapeHtml(title)}</div><div class="chips">${modeChip ? `<span class="chip">${modeChip}</span>` : ''}${job.format ? `<span class="chip">${job.format.toUpperCase()}</span>` : ''}${job.created_at ? `<span class="chip">${new Date(job.created_at).toLocaleString('pt-BR')}</span>` : ''}</div></div><span class="status ${job.status}">${statusLabel}</span></div><div class="bar"><span style="width:${job.progress || 0}%"></span></div>${job.error ? `<div class="error">${escapeHtml(job.error)}</div>` : ''}${renderActions(job)}`; } + function formatDownloadButtons(jobId, artifact) { + return EXPORT_FORMATS.map((fmt) => + `<a class="action fmt" href="/api/jobs/${jobId}/${artifact}?format=${fmt.id}" title="Descarregar ${artifact === 'transcription' ? 
'transcrição' : 'sumário'} em ${fmt.label}">${fmt.label}</a>` + ).join(''); + } + function renderActions(job) { if (job.status !== 'completed' && job.status !== 'interrupted') return ''; const id = job.id; let html = '<div class="actions">'; - const hrefTx = `/api/jobs/${id}/transcription`; - const hrefSum = `/api/jobs/${id}/summary`; - const hrefDl = `/api/jobs/${id}/download`; - const hrefBundle = `/api/jobs/${id}/download-bundle`; if (job.has_transcription) { - html += `<a class="action" href="${hrefTx}">Transcrição</a>`; + html += `<div class="download-row"><span class="download-label">Transcrição</span><div class="download-formats">${formatDownloadButtons(id, 'transcription')}</div></div>`; } if (job.has_summary) { - html += `<a class="action" href="${hrefSum}">Sumário</a>`; + html += `<div class="download-row"><span class="download-label">Sumário</span><div class="download-formats">${formatDownloadButtons(id, 'summary')}</div></div>`; } if (job.has_transcription && job.has_summary) { - html += `<a class="action" href="${hrefBundle}">Pacote ZIP</a>`; + html += `<div class="download-row"><span class="download-label">Pacote</span><div class="download-formats">${EXPORT_FORMATS.map((fmt) => + `<a class="action fmt" href="/api/jobs/${id}/download-bundle?format=${fmt.id}">ZIP ${fmt.label}</a>` + ).join('')}</div></div>`; } if (!job.has_transcription && !job.has_summary && job.result_path) { - html += `<a class="action" href="${hrefDl}">Download</a>`; + const artifact = job.mode === 'summarize' ? 
'summary' : 'transcription'; + html += `<div class="download-row"><span class="download-label">Download</span><div class="download-formats">${formatDownloadButtons(id, artifact)}</div></div>`; } - html += `<button type="button" class="action" onclick="viewJobDetails('${id}')">Visualizar</button></div>`; + html += `<div class="action-row-inline"><button type="button" class="action" onclick="viewJobDetails('${id}')">Visualizar</button></div></div>`; return html; } diff --git a/tests/test_api.py b/tests/test_api.py @@ -34,6 +34,15 @@ class ApiTests(unittest.TestCase): def tearDown(self): shutil.rmtree(self.temp_dir, ignore_errors=True) + def test_process_defaults_to_transcribe_without_mode(self): + response = self.client.post( + "/api/process", + json={"url": "https://example.com/page", "format": "txt"}, + ) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.json().get("mode"), "transcribe") + def test_process_unified_when_both_legacy_flags(self): response = self.client.post( "/api/process", @@ -43,6 +52,20 @@ class ApiTests(unittest.TestCase): self.assertEqual(response.status_code, 200) self.assertEqual(response.json().get("mode"), "process") + def test_process_rejects_both_flags_false(self): + response = self.client.post( + "/api/process", + json={ + "url": "https://example.com/page", + "format": "txt", + "transcribe": False, + "summarize": False, + }, + ) + + self.assertEqual(response.status_code, 400) + self.assertIn("Escolha", response.json()["detail"]) + def test_upload_persists_job_and_downloads_from_store(self): output_dir = Path(os.environ["LAZIER_OUTPUT_DIR"]) / "2026" / "03" / "31" / "sample-job" output_dir.mkdir(parents=True, exist_ok=True) diff --git a/tests/test_processing.py b/tests/test_processing.py @@ -16,7 +16,7 @@ class ProcessingTests(unittest.TestCase): os.environ["OPENAI_API_KEY"] = "test-key" os.environ["LAZIER_OUTPUT_DIR"] = str(self.temp_dir) os.environ["LAZIER_DATA_DIR"] = str(self.temp_dir) - # Garantir defaults 
previsiveis para os testes legados. + os.environ["LAZIER_ALWAYS_SUMMARY"] = "false" os.environ.pop("OPENAI_ENABLE_SMART_SUMMARY", None) os.environ.pop("OPENAI_ENABLE_CHAPTERS", None) reset_model_config_cache() @@ -25,7 +25,7 @@ class ProcessingTests(unittest.TestCase): shutil.rmtree(self.temp_dir, ignore_errors=True) reset_model_config_cache() - def test_audio_transcribe_generates_portuguese_transcription_file(self): + def test_audio_transcribe_skips_summary(self): audio_path = self.temp_dir / "sample.mp3" audio_path.write_bytes(b"fake-audio") @@ -38,10 +38,10 @@ class ProcessingTests(unittest.TestCase): "lazier.core.processing.render_text_in_portuguese", return_value="Olá mundo" ), patch( "lazier.core.processing.summarize_text", return_value="Resumo do áudio." - ), patch( + ) as mock_summary, patch( "lazier.core.processing.detect_content_type", return_value={"content_type": "podcast", "confidence": 0.9, "rationale": ""}, - ): + ) as mock_detect: result = process_source( str(audio_path), mode="transcribe", @@ -53,18 +53,52 @@ class ProcessingTests(unittest.TestCase): output_root=self.temp_dir, ) + mock_summary.assert_not_called() + mock_detect.assert_not_called() + self.assertEqual(result["transcription"], "Olá mundo") + self.assertIsNone(result["summary"]) + self.assertIsNone(result["content_type"]) + self.assertIsNotNone(result["transcription_path"]) + self.assertIsNone(result["summary_path"]) + self.assertEqual(result["result_path"], result["transcription_path"]) + self.assertTrue(Path(result["transcription_path"]).exists()) + + def test_audio_process_generates_both_artifacts(self): + audio_path = self.temp_dir / "both.mp3" + audio_path.write_bytes(b"fake-audio") + + with patch("lazier.core.processing.transcribe_audio", return_value="Hello world"), patch( + "lazier.core.processing.maybe_enrich_transcript_with_diarization", + lambda _p, raw, segs, _md: (raw, segs), + ), patch( + "lazier.core.processing.polish_pt_br_text", side_effect=lambda text, **kwargs: text + 
), patch( + "lazier.core.processing.render_text_in_portuguese", return_value="Olá mundo" + ), patch( + "lazier.core.processing.summarize_text", return_value="Resumo do áudio." + ), patch( + "lazier.core.processing.detect_content_type", + return_value={"content_type": "podcast", "confidence": 0.9, "rationale": ""}, + ): + result = process_source( + str(audio_path), + mode="process", + output_format="txt", + use_smart_summary=False, + use_chapters=False, + run_id="job-both", + source_name="both.mp3", + output_root=self.temp_dir, + ) + self.assertEqual(result["transcription"], "Olá mundo") self.assertEqual(result["summary"], "Resumo do áudio.") - self.assertEqual(result["content_type"], "podcast") self.assertIsNotNone(result["transcription_path"]) self.assertIsNotNone(result["summary_path"]) - self.assertEqual(Path(result["transcription_path"]).name, "sample - transcricao.txt") - self.assertEqual(Path(result["summary_path"]).name, "sample - sumario.txt") - self.assertEqual(result["result_path"], result["summary_path"]) self.assertTrue(Path(result["transcription_path"]).exists()) self.assertTrue(Path(result["summary_path"]).exists()) - def test_text_summarize_generates_summary_file_legacy(self): + def test_text_summarize_generates_summary_only(self): text_path = self.temp_dir / "article.txt" text_path.write_text("This is a long article in English.", encoding="utf-8") @@ -76,9 +110,6 @@ class ProcessingTests(unittest.TestCase): ), patch( "lazier.core.processing.summarize_text", return_value="Resumo em português.", - ), patch( - "lazier.core.processing.detect_content_type", - return_value={"content_type": "tech_doc", "confidence": 0.7, "rationale": ""}, ): result = process_source( str(text_path), @@ -92,15 +123,12 @@ class ProcessingTests(unittest.TestCase): ) self.assertEqual(result["summary"], "Resumo em português.") - self.assertEqual(result["transcription"], "Este é um artigo longo em português.") - self.assertEqual(result["content_type"], "tech_doc") + 
self.assertIsNone(result["transcription"]) + self.assertEqual(result["content_type"], "other") self.assertIsNone(result["smart_summary"]) - self.assertIsNotNone(result["transcription_path"]) + self.assertIsNone(result["transcription_path"]) self.assertIsNotNone(result["summary_path"]) - self.assertEqual(Path(result["summary_path"]).name, "article - sumario.txt") - self.assertEqual(Path(result["transcription_path"]).name, "article - transcricao.txt") self.assertEqual(result["result_path"], result["summary_path"]) - self.assertTrue(Path(result["transcription_path"]).exists()) self.assertTrue(Path(result["summary_path"]).exists()) def test_text_summarize_uses_smart_summary_when_enabled(self): @@ -123,9 +151,6 @@ class ProcessingTests(unittest.TestCase): "lazier.core.processing.render_text_in_portuguese", return_value="Texto convertido para portugues", ), patch( - "lazier.core.processing.detect_content_type", - return_value={"content_type": "lecture", "confidence": 0.85, "rationale": ""}, - ), patch( "lazier.core.processing.summarize_smart", return_value=smart_payload, ) as mock_smart, patch( @@ -145,11 +170,10 @@ class ProcessingTests(unittest.TestCase): mock_smart.assert_called_once() mock_legacy.assert_not_called() self.assertEqual(result["smart_summary"], smart_payload) - self.assertEqual(result["content_type"], "lecture") + self.assertEqual(result["content_type"], "other") self.assertIn("Resumo curto", result["summary"]) - self.assertIsNotNone(result["transcription_path"]) + self.assertIsNone(result["transcription_path"]) self.assertIsNotNone(result["summary_path"]) - self.assertEqual(Path(result["summary_path"]).name, "smart - sumario.md") self.assertEqual(result["result_path"], result["summary_path"]) diff --git a/tests/test_smart_summary.py b/tests/test_smart_summary.py @@ -19,6 +19,7 @@ def _fake_summary_config(): cfg.summary_chunk_overlap_chars = 10 cfg.chat_model = "gpt-5-mini" cfg.reasoning_effort = "medium" + cfg.summary_parallel_workers = 1 
cfg.supports_reasoning = MagicMock(return_value=True) return cfg