lazier

personal summarizer
Log | Files | Refs | README

commit b1738d8d282ca60a93ad65b88b9f0959537ad129
parent 30c86b97c78ea073da753b26ce14c3aa9ddc89bc
Author: Pablo Murad <pblmrd@gmail.com>
Date:   Thu, 21 May 2026 21:28:55 -0300

ESMOL CHANGES

Diffstat:
M README.md | 4 +++-
M lazier/api/routes.py | 29 +++++++++++++++++++++++------
M lazier/core/jobs.py | 12 +++++++++++-
M lazier/core/processing.py | 4 ++++
M lazier/downloader.py | 58 +++++++++++++++++++++++++++++++++++++++-------------------
M lazier/utils.py | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
A tests/test_export_filenames.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tests/test_jobs.py | 3 ++-
A tests/test_music_detection.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tests/test_processing.py | 9 +++++----
10 files changed, 291 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md @@ -174,6 +174,8 @@ Para áudio/vídeo, quando `OPENAI_ENABLE_CHAPTERS=true`, o Lazier: Os capítulos aparecem nos exports DOCX/Markdown/PDF/TXT e no JSON cru. +Os ficheiros exportados usam o **título do vídeo ou do conteúdo** (sanitizado) como nome e como cabeçalho do documento. Quando há transcrição e sumário em ficheiros separados, os sufixos `- transcricao` e `- sumario` distinguem cada artefato. + ## Sites suportados (vídeo/áudio) Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sites**. O Lazier usa o [yt-dlp](https://github.com/yt-dlp/yt-dlp) para extrair o áudio; se a URL não for um vídeo, o sistema tenta extrair o texto da página e sumarizar. @@ -198,7 +200,7 @@ Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sit E muitos outros. **Lista completa** mantida pelo yt-dlp: [Supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md). -**Importante:** Conteúdo detectado como **música** (ex.: categoria Music no YouTube, domínios só de música) **não é processado** pelo Lazier. +**Importante:** Conteúdo cujo **tema é música** (categoria Music no YouTube, tags de clipe oficial, domínios só de música) **não é processado** pelo Lazier. Vídeos com **música de fundo** (vlogs, palestras, tutoriais) continuam sendo processados normalmente. 
## Docker diff --git a/lazier/api/routes.py b/lazier/api/routes.py @@ -17,11 +17,11 @@ from pydantic import BaseModel from ..audio_processor import extract_audio_from_video from ..core.formats import export -from ..core.jobs import build_job_artifact_path, get_job_store, lazier_download_filename +from ..core.jobs import build_job_artifact_path, get_job_store from ..core.processing import process_source from ..core.supported_sites import SUPPORTED_VIDEO_SITES from ..core.playlist import is_playlist_url -from ..utils import VIDEO_EXTENSIONS, is_upload_extension_allowed +from ..utils import VIDEO_EXTENSIONS, build_export_filename, is_upload_extension_allowed from .websocket import broadcast_progress logger = logging.getLogger(__name__) @@ -72,6 +72,15 @@ def _resolve_mode( return "process" +def _download_filename(job: dict, artifact_kind: str) -> str: + return build_export_filename( + job.get("metadata", {}), + job.get("format", "docx"), + source_name=job.get("source_name"), + artifact_kind=artifact_kind, + ) + + def _job_title(job: dict) -> str: metadata = job.get("metadata", {}) if metadata.get("title"): @@ -236,6 +245,7 @@ def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]: format_type=job.get("format", "docx"), artifact_kind="transcription", created_at=job.get("created_at"), + metadata=job.get("metadata", {}), ) export( transcription=job["transcription"], @@ -254,6 +264,7 @@ def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]: format_type=job.get("format", "docx"), artifact_kind="summary", created_at=job.get("created_at"), + metadata=job.get("metadata", {}), ) export( transcription="", @@ -453,7 +464,7 @@ async def download_transcription(job_id: str): if not download_path: raise HTTPException(status_code=404, detail="Transcricao nao disponivel") - filename = Path(download_path).name + filename = _download_filename(job, "transcription") return FileResponse(download_path, media_type="application/octet-stream", 
filename=filename) @@ -469,7 +480,7 @@ async def download_summary(job_id: str): if not download_path: raise HTTPException(status_code=404, detail="Sumario nao disponivel") - filename = Path(download_path).name + filename = _download_filename(job, "summary") return FileResponse(download_path, media_type="application/octet-stream", filename=filename) @@ -485,7 +496,8 @@ async def download_result(job_id: str): if not download_path: raise HTTPException(status_code=404, detail="Arquivo de resultado nao encontrado") - filename = Path(download_path).name + artifact_kind = "transcription" if job.get("mode") == "transcribe" else "summary" + filename = _download_filename(job, artifact_kind) return FileResponse(download_path, media_type="application/octet-stream", filename=filename) @@ -510,7 +522,12 @@ async def download_bundle(job_id: str, background_tasks: BackgroundTasks): tmp_path.unlink(missing_ok=True) raise - zip_filename = lazier_download_filename("zip") + zip_filename = build_export_filename( + job.get("metadata", {}), + "zip", + source_name=job.get("source_name"), + artifact_kind="result", + ) background_tasks.add_task(_unlink_quiet, str(tmp_path)) return FileResponse(str(tmp_path), media_type="application/zip", filename=zip_filename) diff --git a/lazier/core/jobs.py b/lazier/core/jobs.py @@ -13,6 +13,8 @@ from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional, Union +from ..utils import build_export_filename + TERMINAL_STATUSES = {"completed", "failed", "interrupted"} ARTIFACT_KINDS = frozenset({"transcription", "summary", "result"}) @@ -75,6 +77,7 @@ def build_job_artifact_path( artifact_kind: str, created_at: Optional[Union[str, datetime]] = None, output_root: Optional[Path] = None, + metadata: Optional[Dict[str, Any]] = None, ) -> Path: if artifact_kind not in ARTIFACT_KINDS: raise ValueError(f"Artefato nao suportado: {artifact_kind}") @@ -86,7 +89,14 @@ def build_job_artifact_path( output_root=output_root, ) 
output_dir.mkdir(parents=True, exist_ok=True) - return output_dir / lazier_download_filename(format_type) + filename = build_export_filename( + metadata, + format_type, + source_name=source_name, + artifact_kind=artifact_kind, + output_dir=output_dir, + ) + return output_dir / filename class JobStore: diff --git a/lazier/core/processing.py b/lazier/core/processing.py @@ -729,6 +729,7 @@ def process_source( artifact_kind="transcription", created_at=created_at, output_root=output_root, + metadata=metadata, ) sm_path = build_job_artifact_path( job_id=run_id, @@ -737,6 +738,7 @@ def process_source( artifact_kind="summary", created_at=created_at, output_root=output_root, + metadata=metadata, ) export( transcription=portuguese_text or "", @@ -763,6 +765,7 @@ def process_source( artifact_kind="summary", created_at=created_at, output_root=output_root, + metadata=metadata, ) exported_path = export( transcription="", @@ -781,6 +784,7 @@ def process_source( artifact_kind="transcription", created_at=created_at, output_root=output_root, + metadata=metadata, ) exported_path = export( transcription=portuguese_text or "", diff --git a/lazier/downloader.py b/lazier/downloader.py @@ -38,10 +38,13 @@ MUSIC_ONLY_DOMAINS = frozenset({ 'jiosaavn.com', 'gaana.com', 'wynk.in', 'hungama.com', 'yandexmusic.ru', }) -# Valores de category/genre/tags que indicam conteúdo de música -MUSIC_CATEGORY_KEYWORDS = frozenset({ - 'music', 'música', 'musica', 'music video', 'music video clip', - 'song', 'canção', 'cancao', 'album', 'single', 'mv', 'clip', +# Categoria/gênero YouTube (ou equivalente) que indica conteúdo cujo tema é música +MUSIC_CATEGORY_EXACT = frozenset({'music', 'música', 'musica'}) + +# Frases em tags que indicam clipe/áudio oficial (não música de fundo em vlogs) +MUSIC_TAG_PHRASES = frozenset({ + 'music video', 'music video clip', 'official music video', + 'official audio', 'lyrics video', 'lyric video', }) @@ -84,30 +87,47 @@ def is_music_domain(url: str) -> bool: return False +def 
_normalize_category_list(info: Dict[str, Any]) -> list[str]: + """Extrai lista de categorias a partir dos metadados do yt-dlp.""" + categories = info.get('categories') or [] + if isinstance(categories, str): + categories = [categories] + elif not isinstance(categories, list): + categories = [] + if not categories and info.get('category'): + cat = info['category'] + categories = [cat] if isinstance(cat, str) else list(cat) if cat else [] + return [str(c).strip().lower() for c in categories if c] + + def is_music_content(info: Dict[str, Any]) -> bool: """ - Verifica se os metadados do extract_info indicam conteúdo de música. + Verifica se os metadados do extract_info indicam conteúdo cujo tema é música. Usado após extract_info (yt-dlp) para recusar processamento de música. + + Vídeos com trilha de fundo (tags como "background music") não são bloqueados; + apenas categoria Music/Música, gênero equivalente ou tags de clipe oficial. """ if not info: return False - category = (info.get('categories') or []) if isinstance(info.get('categories'), list) else [] - if not category and info.get('category'): - category = [info['category']] - genre = (info.get('genre') or '') if isinstance(info.get('genre'), str) else '' + + for cat in _normalize_category_list(info): + if cat in MUSIC_CATEGORY_EXACT: + return True + + genre = info.get('genre') or '' + if isinstance(genre, str) and genre.strip().lower() in MUSIC_CATEGORY_EXACT: + return True + tags = info.get('tags') or [] if isinstance(tags, str): tags = [tags] - # YouTube: category id ou category string - yt_category = (info.get('categories') or [info.get('category')] if info.get('category') else []) - if isinstance(yt_category, str): - yt_category = [yt_category] - combined = ' '.join( - str(x).lower() for x in - (category + [genre] + list(tags)[:20] + list(yt_category)) - if x - ) - return any(kw in combined for kw in MUSIC_CATEGORY_KEYWORDS) + for tag in list(tags)[:20]: + tag_lower = str(tag).lower() + if any(phrase in 
tag_lower for phrase in MUSIC_TAG_PHRASES): + return True + + return False def _classify_youtube_error(error: Exception) -> Tuple[Type[YouTubeDownloadError], bool]: diff --git a/lazier/utils.py b/lazier/utils.py @@ -4,12 +4,13 @@ Utilitários para validação, limpeza e verificação de dependências import os import re +import secrets import shutil import threading import time import unicodedata from pathlib import Path -from typing import Optional, Tuple +from typing import Any, Dict, Optional, Tuple from urllib.parse import urlparse @@ -156,6 +157,91 @@ def sanitize_filename(filename: str) -> str: return filename.strip() +_EMPTY_TITLE_VALUES = frozenset({"", "sem título", "sem titulo"}) + +_ARTIFACT_FILENAME_SUFFIX = { + "transcription": " - transcricao", + "summary": " - sumario", + "result": "", +} + + +def _title_usable_for_filename( + metadata: Optional[Dict[str, Any]] = None, + source_name: Optional[str] = None, +) -> bool: + meta = metadata or {} + title = meta.get("title") + if title and str(title).strip().lower() not in _EMPTY_TITLE_VALUES: + return True + if source_name and not str(source_name).startswith(("http://", "https://")): + if Path(source_name).stem: + return True + return False + + +def resolve_export_title( + metadata: Optional[Dict[str, Any]] = None, + source_name: Optional[str] = None, + fallback: str = "Transcricao", +) -> str: + """Titulo para documento e base do nome de ficheiro exportado.""" + meta = metadata or {} + title = meta.get("title") + if title and str(title).strip().lower() not in _EMPTY_TITLE_VALUES: + return str(title).strip() + if source_name and not str(source_name).startswith(("http://", "https://")): + stem = Path(source_name).stem + if stem: + return stem + return fallback + + +def _lazier_fallback_filename(format_type: str, artifact_suffix: str = "") -> str: + n = secrets.randbelow(900_000) + 100_000 + ext = format_type.lstrip(".") + return f"lazier_{n}{artifact_suffix}.{ext}" + + +def build_export_filename( + metadata: 
Optional[Dict[str, Any]] = None, + format_type: str = "docx", + *, + source_name: Optional[str] = None, + artifact_kind: Optional[str] = None, + output_dir: Optional[Path] = None, +) -> str: + """ + Nome de ficheiro para exportacao baseado no titulo do video/conteudo. + Usa fallback lazier_<6 digitos> quando nao ha titulo utilizavel. + """ + ext = format_type.lstrip(".") + artifact_suffix = _ARTIFACT_FILENAME_SUFFIX.get(artifact_kind or "result", "") + + if _title_usable_for_filename(metadata, source_name): + base = sanitize_filename(resolve_export_title(metadata, source_name)) + name = f"{base}{artifact_suffix}.{ext}" + else: + name = _lazier_fallback_filename(format_type, artifact_suffix) + + if output_dir is None: + return name + + out = Path(output_dir) + path = out / name + if not path.exists(): + return name + + stem = path.stem + suffix = path.suffix + counter = 2 + while True: + candidate = f"{stem}-{counter}{suffix}" + if not (out / candidate).exists(): + return candidate + counter += 1 + + def sanitize_xml_string(text: str) -> str: """ Remove caracteres inválidos para XML, mantendo apenas caracteres válidos diff --git a/tests/test_export_filenames.py b/tests/test_export_filenames.py @@ -0,0 +1,58 @@ +import os +import unittest +from pathlib import Path + +from lazier.utils import ( + build_export_filename, + resolve_export_title, + sanitize_filename, +) + + +class ExportFilenameTests(unittest.TestCase): + def setUp(self): + self.temp_dir = Path(os.getcwd()) / ".tmp-tests" / "export-filenames" + self.temp_dir.mkdir(parents=True, exist_ok=True) + + def tearDown(self): + for f in self.temp_dir.glob("*"): + f.unlink(missing_ok=True) + + def test_resolve_title_from_metadata(self): + title = resolve_export_title({"title": "Meu Video Legal"}, source_name="https://youtube.com/watch?v=x") + self.assertEqual(title, "Meu Video Legal") + + def test_resolve_title_from_source_name_when_no_metadata_title(self): + title = resolve_export_title({}, 
source_name="palestra.mp3") + self.assertEqual(title, "palestra") + + def test_video_title_becomes_docx_filename(self): + name = build_export_filename( + {"title": "Meu Video"}, + "docx", + source_name="https://youtube.com/watch?v=abc", + ) + self.assertEqual(name, f"{sanitize_filename('Meu Video')}.docx") + + def test_dual_artifacts_use_transcricao_and_sumario_suffixes(self): + meta = {"title": "Palestra IA"} + tx = build_export_filename(meta, "docx", artifact_kind="transcription") + sm = build_export_filename(meta, "docx", artifact_kind="summary") + self.assertTrue(tx.endswith(" - transcricao.docx")) + self.assertTrue(sm.endswith(" - sumario.docx")) + self.assertNotEqual(tx, sm) + + def test_collision_appends_numeric_suffix(self): + meta = {"title": "Duplicado"} + first = build_export_filename(meta, "txt", output_dir=self.temp_dir) + (self.temp_dir / first).write_text("x", encoding="utf-8") + second = build_export_filename(meta, "txt", output_dir=self.temp_dir) + self.assertEqual(second, f"{sanitize_filename('Duplicado')}-2.txt") + + def test_no_title_falls_back_to_lazier_pattern(self): + name = build_export_filename({}, "docx", source_name="https://example.com/v") + self.assertRegex(name, r"^lazier_\d{6}\.docx$") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_jobs.py b/tests/test_jobs.py @@ -55,10 +55,11 @@ class JobStoreTests(unittest.TestCase): artifact_kind="summary", created_at=created_at, output_root=self.temp_dir, + metadata={"title": "Minha Reunião"}, ) self.assertIn("2026", str(output_dir)) self.assertIn("03", str(output_dir)) self.assertIn("31", str(output_dir)) self.assertTrue(output_dir.name.startswith("minha-reuniao-abc12345")) - self.assertRegex(summary_path.name, r"^lazier_\d{6}\.txt$") + self.assertEqual(summary_path.name, "Minha Reunião - sumario.txt") diff --git a/tests/test_music_detection.py b/tests/test_music_detection.py @@ -0,0 +1,59 @@ +import unittest + +from lazier.downloader import is_music_content, 
is_music_domain + + +class MusicDetectionTests(unittest.TestCase): + def test_vlog_with_background_music_tag_not_blocked(self): + info = { + 'category': 'Education', + 'tags': ['background music', 'vlog', 'tutorial'], + } + self.assertFalse(is_music_content(info)) + + def test_podcast_category_not_blocked(self): + info = { + 'category': 'People & Blogs', + 'tags': ['podcast', 'interview'], + } + self.assertFalse(is_music_content(info)) + + def test_music_category_blocked(self): + info = {'category': 'Music', 'tags': ['pop', '2024']} + self.assertTrue(is_music_content(info)) + + def test_music_category_lowercase_blocked(self): + info = {'categories': ['music']} + self.assertTrue(is_music_content(info)) + + def test_official_music_video_tag_blocked(self): + info = { + 'category': 'Entertainment', + 'tags': ['official music video', 'artist name'], + } + self.assertTrue(is_music_content(info)) + + def test_lyrics_video_tag_blocked(self): + info = { + 'category': 'Entertainment', + 'tags': ['lyrics video'], + } + self.assertTrue(is_music_content(info)) + + def test_genre_music_blocked(self): + info = {'category': 'Entertainment', 'genre': 'Música'} + self.assertTrue(is_music_content(info)) + + def test_empty_info_not_blocked(self): + self.assertFalse(is_music_content({})) + self.assertFalse(is_music_content(None)) + + def test_spotify_domain_blocked(self): + self.assertTrue(is_music_domain('https://open.spotify.com/track/abc')) + + def test_youtube_domain_not_blocked_by_domain_check(self): + self.assertFalse(is_music_domain('https://www.youtube.com/watch?v=abc')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_processing.py b/tests/test_processing.py @@ -58,8 +58,8 @@ class ProcessingTests(unittest.TestCase): self.assertEqual(result["content_type"], "podcast") self.assertIsNotNone(result["transcription_path"]) self.assertIsNotNone(result["summary_path"]) - self.assertRegex(Path(result["transcription_path"]).name, r"^lazier_\d{6}\.txt$") - 
self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$") + self.assertEqual(Path(result["transcription_path"]).name, "sample - transcricao.txt") + self.assertEqual(Path(result["summary_path"]).name, "sample - sumario.txt") self.assertEqual(result["result_path"], result["summary_path"]) self.assertTrue(Path(result["transcription_path"]).exists()) self.assertTrue(Path(result["summary_path"]).exists()) @@ -97,7 +97,8 @@ class ProcessingTests(unittest.TestCase): self.assertIsNone(result["smart_summary"]) self.assertIsNotNone(result["transcription_path"]) self.assertIsNotNone(result["summary_path"]) - self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$") + self.assertEqual(Path(result["summary_path"]).name, "article - sumario.txt") + self.assertEqual(Path(result["transcription_path"]).name, "article - transcricao.txt") self.assertEqual(result["result_path"], result["summary_path"]) self.assertTrue(Path(result["transcription_path"]).exists()) self.assertTrue(Path(result["summary_path"]).exists()) @@ -148,7 +149,7 @@ class ProcessingTests(unittest.TestCase): self.assertIn("Resumo curto", result["summary"]) self.assertIsNotNone(result["transcription_path"]) self.assertIsNotNone(result["summary_path"]) - self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.md$") + self.assertEqual(Path(result["summary_path"]).name, "smart - sumario.md") self.assertEqual(result["result_path"], result["summary_path"])