ESMOL CHANGES - lazier - personal summarizer

commit b1738d8d282ca60a93ad65b88b9f0959537ad129
parent 30c86b97c78ea073da753b26ce14c3aa9ddc89bc
Author: Pablo Murad <pblmrd@gmail.com>
Date:   Thu, 21 May 2026 21:28:55 -0300

ESMOL CHANGES

Diffstat:
M README.md  | 4 +++-
M lazier/api/routes.py  | 29 +++++++++++++++++++++++------
M lazier/core/jobs.py  | 12 +++++++++++-
M lazier/core/processing.py  | 4 ++++
M lazier/downloader.py  | 58 +++++++++++++++++++++++++++++++++++++++-------------------
M lazier/utils.py  | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
A tests/test_export_filenames.py  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tests/test_jobs.py  | 3 ++-
A tests/test_music_detection.py  | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tests/test_processing.py  | 9 +++++----

10 files changed, 291 insertions(+), 33 deletions(-)
diff --git a/README.md b/README.md
@@ -174,6 +174,8 @@ Para áudio/vídeo, quando `OPENAI_ENABLE_CHAPTERS=true`, o Lazier:
 
 Os capítulos aparecem nos exports DOCX/Markdown/PDF/TXT e no JSON cru.
 
+Os ficheiros exportados usam o **título do vídeo ou do conteúdo** (sanitizado) como nome e como cabeçalho do documento. Quando há transcrição e sumário em ficheiros separados, os sufixos `- transcricao` e `- sumario` distinguem cada artefato.
+
 ## Sites suportados (vídeo/áudio)
 
 Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sites**. O Lazier usa o [yt-dlp](https://github.com/yt-dlp/yt-dlp) para extrair o áudio; se a URL não for um vídeo, o sistema tenta extrair o texto da página e sumarizar.
@@ -198,7 +200,7 @@ Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sit
 
 E muitos outros. **Lista completa** mantida pelo yt-dlp: [Supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md).
 
-**Importante:** Conteúdo detectado como **música** (ex.: categoria Music no YouTube, domínios só de música) **não é processado** pelo Lazier.
+**Importante:** Conteúdo cujo **tema é música** (categoria Music no YouTube, tags de clipe oficial, domínios só de música) **não é processado** pelo Lazier. Vídeos com **música de fundo** (vlogs, palestras, tutoriais) continuam sendo processados normalmente.
 
 ## Docker
 
diff --git a/lazier/api/routes.py b/lazier/api/routes.py
@@ -17,11 +17,11 @@ from pydantic import BaseModel
 
 from ..audio_processor import extract_audio_from_video
 from ..core.formats import export
-from ..core.jobs import build_job_artifact_path, get_job_store, lazier_download_filename
+from ..core.jobs import build_job_artifact_path, get_job_store
 from ..core.processing import process_source
 from ..core.supported_sites import SUPPORTED_VIDEO_SITES
 from ..core.playlist import is_playlist_url
-from ..utils import VIDEO_EXTENSIONS, is_upload_extension_allowed
+from ..utils import VIDEO_EXTENSIONS, build_export_filename, is_upload_extension_allowed
 from .websocket import broadcast_progress
 
 logger = logging.getLogger(__name__)
@@ -72,6 +72,15 @@ def _resolve_mode(
     return "process"
 
 
+def _download_filename(job: dict, artifact_kind: str) -> str:
+    return build_export_filename(
+        job.get("metadata", {}),
+        job.get("format", "docx"),
+        source_name=job.get("source_name"),
+        artifact_kind=artifact_kind,
+    )
+
+
 def _job_title(job: dict) -> str:
     metadata = job.get("metadata", {})
     if metadata.get("title"):
@@ -236,6 +245,7 @@ def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]:
             format_type=job.get("format", "docx"),
             artifact_kind="transcription",
             created_at=job.get("created_at"),
+            metadata=job.get("metadata", {}),
         )
         export(
             transcription=job["transcription"],
@@ -254,6 +264,7 @@ def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]:
             format_type=job.get("format", "docx"),
             artifact_kind="summary",
             created_at=job.get("created_at"),
+            metadata=job.get("metadata", {}),
         )
         export(
             transcription="",
@@ -453,7 +464,7 @@ async def download_transcription(job_id: str):
     if not download_path:
         raise HTTPException(status_code=404, detail="Transcricao nao disponivel")
 
-    filename = Path(download_path).name
+    filename = _download_filename(job, "transcription")
     return FileResponse(download_path, media_type="application/octet-stream", filename=filename)
 
 
@@ -469,7 +480,7 @@ async def download_summary(job_id: str):
     if not download_path:
         raise HTTPException(status_code=404, detail="Sumario nao disponivel")
 
-    filename = Path(download_path).name
+    filename = _download_filename(job, "summary")
     return FileResponse(download_path, media_type="application/octet-stream", filename=filename)
 
 
@@ -485,7 +496,8 @@ async def download_result(job_id: str):
     if not download_path:
         raise HTTPException(status_code=404, detail="Arquivo de resultado nao encontrado")
 
-    filename = Path(download_path).name
+    artifact_kind = "transcription" if job.get("mode") == "transcribe" else "summary"
+    filename = _download_filename(job, artifact_kind)
     return FileResponse(download_path, media_type="application/octet-stream", filename=filename)
 
 
@@ -510,7 +522,12 @@ async def download_bundle(job_id: str, background_tasks: BackgroundTasks):
         tmp_path.unlink(missing_ok=True)
         raise
 
-    zip_filename = lazier_download_filename("zip")
+    zip_filename = build_export_filename(
+        job.get("metadata", {}),
+        "zip",
+        source_name=job.get("source_name"),
+        artifact_kind="result",
+    )
     background_tasks.add_task(_unlink_quiet, str(tmp_path))
     return FileResponse(str(tmp_path), media_type="application/zip", filename=zip_filename)
 
diff --git a/lazier/core/jobs.py b/lazier/core/jobs.py
@@ -13,6 +13,8 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+from ..utils import build_export_filename
+
 
 TERMINAL_STATUSES = {"completed", "failed", "interrupted"}
 ARTIFACT_KINDS = frozenset({"transcription", "summary", "result"})
@@ -75,6 +77,7 @@ def build_job_artifact_path(
     artifact_kind: str,
     created_at: Optional[Union[str, datetime]] = None,
     output_root: Optional[Path] = None,
+    metadata: Optional[Dict[str, Any]] = None,
 ) -> Path:
     if artifact_kind not in ARTIFACT_KINDS:
         raise ValueError(f"Artefato nao suportado: {artifact_kind}")
@@ -86,7 +89,14 @@ def build_job_artifact_path(
         output_root=output_root,
     )
     output_dir.mkdir(parents=True, exist_ok=True)
-    return output_dir / lazier_download_filename(format_type)
+    filename = build_export_filename(
+        metadata,
+        format_type,
+        source_name=source_name,
+        artifact_kind=artifact_kind,
+        output_dir=output_dir,
+    )
+    return output_dir / filename
 
 
 class JobStore:
diff --git a/lazier/core/processing.py b/lazier/core/processing.py
@@ -729,6 +729,7 @@ def process_source(
                 artifact_kind="transcription",
                 created_at=created_at,
                 output_root=output_root,
+                metadata=metadata,
             )
             sm_path = build_job_artifact_path(
                 job_id=run_id,
@@ -737,6 +738,7 @@ def process_source(
                 artifact_kind="summary",
                 created_at=created_at,
                 output_root=output_root,
+                metadata=metadata,
             )
             export(
                 transcription=portuguese_text or "",
@@ -763,6 +765,7 @@ def process_source(
                 artifact_kind="summary",
                 created_at=created_at,
                 output_root=output_root,
+                metadata=metadata,
             )
             exported_path = export(
                 transcription="",
@@ -781,6 +784,7 @@ def process_source(
                 artifact_kind="transcription",
                 created_at=created_at,
                 output_root=output_root,
+                metadata=metadata,
             )
             exported_path = export(
                 transcription=portuguese_text or "",
diff --git a/lazier/downloader.py b/lazier/downloader.py
@@ -38,10 +38,13 @@ MUSIC_ONLY_DOMAINS = frozenset({
     'jiosaavn.com', 'gaana.com', 'wynk.in', 'hungama.com', 'yandexmusic.ru',
 })
 
-# Valores de category/genre/tags que indicam conteúdo de música
-MUSIC_CATEGORY_KEYWORDS = frozenset({
-    'music', 'música', 'musica', 'music video', 'music video clip',
-    'song', 'canção', 'cancao', 'album', 'single', 'mv', 'clip',
+# Categoria/gênero YouTube (ou equivalente) que indica conteúdo cujo tema é música
+MUSIC_CATEGORY_EXACT = frozenset({'music', 'música', 'musica'})
+
+# Frases em tags que indicam clipe/áudio oficial (não música de fundo em vlogs)
+MUSIC_TAG_PHRASES = frozenset({
+    'music video', 'music video clip', 'official music video',
+    'official audio', 'lyrics video', 'lyric video',
 })
 
 
@@ -84,30 +87,47 @@ def is_music_domain(url: str) -> bool:
         return False
 
 
+def _normalize_category_list(info: Dict[str, Any]) -> list[str]:
+    """Extrai lista de categorias a partir dos metadados do yt-dlp."""
+    categories = info.get('categories') or []
+    if isinstance(categories, str):
+        categories = [categories]
+    elif not isinstance(categories, list):
+        categories = []
+    if not categories and info.get('category'):
+        cat = info['category']
+        categories = [cat] if isinstance(cat, str) else list(cat) if cat else []
+    return [str(c).strip().lower() for c in categories if c]
+
+
 def is_music_content(info: Dict[str, Any]) -> bool:
     """
-    Verifica se os metadados do extract_info indicam conteúdo de música.
+    Verifica se os metadados do extract_info indicam conteúdo cujo tema é música.
     Usado após extract_info (yt-dlp) para recusar processamento de música.
+
+    Vídeos com trilha de fundo (tags como "background music") não são bloqueados;
+    apenas categoria Music/Música, gênero equivalente ou tags de clipe oficial.
     """
     if not info:
         return False
-    category = (info.get('categories') or []) if isinstance(info.get('categories'), list) else []
-    if not category and info.get('category'):
-        category = [info['category']]
-    genre = (info.get('genre') or '') if isinstance(info.get('genre'), str) else ''
+
+    for cat in _normalize_category_list(info):
+        if cat in MUSIC_CATEGORY_EXACT:
+            return True
+
+    genre = info.get('genre') or ''
+    if isinstance(genre, str) and genre.strip().lower() in MUSIC_CATEGORY_EXACT:
+        return True
+
     tags = info.get('tags') or []
     if isinstance(tags, str):
         tags = [tags]
-    # YouTube: category id ou category string
-    yt_category = (info.get('categories') or [info.get('category')] if info.get('category') else [])
-    if isinstance(yt_category, str):
-        yt_category = [yt_category]
-    combined = ' '.join(
-        str(x).lower() for x in
-        (category + [genre] + list(tags)[:20] + list(yt_category))
-        if x
-    )
-    return any(kw in combined for kw in MUSIC_CATEGORY_KEYWORDS)
+    for tag in list(tags)[:20]:
+        tag_lower = str(tag).lower()
+        if any(phrase in tag_lower for phrase in MUSIC_TAG_PHRASES):
+            return True
+
+    return False
 
 
 def _classify_youtube_error(error: Exception) -> Tuple[Type[YouTubeDownloadError], bool]:
diff --git a/lazier/utils.py b/lazier/utils.py
@@ -4,12 +4,13 @@ Utilitários para validação, limpeza e verificação de dependências
 
 import os
 import re
+import secrets
 import shutil
 import threading
 import time
 import unicodedata
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 from urllib.parse import urlparse
 
 
@@ -156,6 +157,91 @@ def sanitize_filename(filename: str) -> str:
     return filename.strip()
 
 
+_EMPTY_TITLE_VALUES = frozenset({"", "sem título", "sem titulo"})
+
+_ARTIFACT_FILENAME_SUFFIX = {
+    "transcription": " - transcricao",
+    "summary": " - sumario",
+    "result": "",
+}
+
+
+def _title_usable_for_filename(
+    metadata: Optional[Dict[str, Any]] = None,
+    source_name: Optional[str] = None,
+) -> bool:
+    meta = metadata or {}
+    title = meta.get("title")
+    if title and str(title).strip().lower() not in _EMPTY_TITLE_VALUES:
+        return True
+    if source_name and not str(source_name).startswith(("http://", "https://")):
+        if Path(source_name).stem:
+            return True
+    return False
+
+
+def resolve_export_title(
+    metadata: Optional[Dict[str, Any]] = None,
+    source_name: Optional[str] = None,
+    fallback: str = "Transcricao",
+) -> str:
+    """Titulo para documento e base do nome de ficheiro exportado."""
+    meta = metadata or {}
+    title = meta.get("title")
+    if title and str(title).strip().lower() not in _EMPTY_TITLE_VALUES:
+        return str(title).strip()
+    if source_name and not str(source_name).startswith(("http://", "https://")):
+        stem = Path(source_name).stem
+        if stem:
+            return stem
+    return fallback
+
+
+def _lazier_fallback_filename(format_type: str, artifact_suffix: str = "") -> str:
+    n = secrets.randbelow(900_000) + 100_000
+    ext = format_type.lstrip(".")
+    return f"lazier_{n}{artifact_suffix}.{ext}"
+
+
+def build_export_filename(
+    metadata: Optional[Dict[str, Any]] = None,
+    format_type: str = "docx",
+    *,
+    source_name: Optional[str] = None,
+    artifact_kind: Optional[str] = None,
+    output_dir: Optional[Path] = None,
+) -> str:
+    """
+    Nome de ficheiro para exportacao baseado no titulo do video/conteudo.
+    Usa fallback lazier_<6 digitos> quando nao ha titulo utilizavel.
+    """
+    ext = format_type.lstrip(".")
+    artifact_suffix = _ARTIFACT_FILENAME_SUFFIX.get(artifact_kind or "result", "")
+
+    if _title_usable_for_filename(metadata, source_name):
+        base = sanitize_filename(resolve_export_title(metadata, source_name))
+        name = f"{base}{artifact_suffix}.{ext}"
+    else:
+        name = _lazier_fallback_filename(format_type, artifact_suffix)
+
+    if output_dir is None:
+        return name
+
+    out = Path(output_dir)
+    path = out / name
+    if not path.exists():
+        return name
+
+    stem = path.stem
+    suffix = path.suffix
+    counter = 2
+    while True:
+        candidate = f"{stem}-{counter}{suffix}"
+        if not (out / candidate).exists():
+            return candidate
+        counter += 1
+
+
 def sanitize_xml_string(text: str) -> str:
     """
     Remove caracteres inválidos para XML, mantendo apenas caracteres válidos
diff --git a/tests/test_export_filenames.py b/tests/test_export_filenames.py
@@ -0,0 +1,58 @@
+import os
+import unittest
+from pathlib import Path
+
+from lazier.utils import (
+    build_export_filename,
+    resolve_export_title,
+    sanitize_filename,
+)
+
+
+class ExportFilenameTests(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = Path(os.getcwd()) / ".tmp-tests" / "export-filenames"
+        self.temp_dir.mkdir(parents=True, exist_ok=True)
+
+    def tearDown(self):
+        for f in self.temp_dir.glob("*"):
+            f.unlink(missing_ok=True)
+
+    def test_resolve_title_from_metadata(self):
+        title = resolve_export_title({"title": "Meu Video Legal"}, source_name="https://youtube.com/watch?v=x")
+        self.assertEqual(title, "Meu Video Legal")
+
+    def test_resolve_title_from_source_name_when_no_metadata_title(self):
+        title = resolve_export_title({}, source_name="palestra.mp3")
+        self.assertEqual(title, "palestra")
+
+    def test_video_title_becomes_docx_filename(self):
+        name = build_export_filename(
+            {"title": "Meu Video"},
+            "docx",
+            source_name="https://youtube.com/watch?v=abc",
+        )
+        self.assertEqual(name, f"{sanitize_filename('Meu Video')}.docx")
+
+    def test_dual_artifacts_use_transcricao_and_sumario_suffixes(self):
+        meta = {"title": "Palestra IA"}
+        tx = build_export_filename(meta, "docx", artifact_kind="transcription")
+        sm = build_export_filename(meta, "docx", artifact_kind="summary")
+        self.assertTrue(tx.endswith(" - transcricao.docx"))
+        self.assertTrue(sm.endswith(" - sumario.docx"))
+        self.assertNotEqual(tx, sm)
+
+    def test_collision_appends_numeric_suffix(self):
+        meta = {"title": "Duplicado"}
+        first = build_export_filename(meta, "txt", output_dir=self.temp_dir)
+        (self.temp_dir / first).write_text("x", encoding="utf-8")
+        second = build_export_filename(meta, "txt", output_dir=self.temp_dir)
+        self.assertEqual(second, f"{sanitize_filename('Duplicado')}-2.txt")
+
+    def test_no_title_falls_back_to_lazier_pattern(self):
+        name = build_export_filename({}, "docx", source_name="https://example.com/v")
+        self.assertRegex(name, r"^lazier_\d{6}\.docx$")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_jobs.py b/tests/test_jobs.py
@@ -55,10 +55,11 @@ class JobStoreTests(unittest.TestCase):
             artifact_kind="summary",
             created_at=created_at,
             output_root=self.temp_dir,
+            metadata={"title": "Minha Reunião"},
         )
 
         self.assertIn("2026", str(output_dir))
         self.assertIn("03", str(output_dir))
         self.assertIn("31", str(output_dir))
         self.assertTrue(output_dir.name.startswith("minha-reuniao-abc12345"))
-        self.assertRegex(summary_path.name, r"^lazier_\d{6}\.txt$")
+        self.assertEqual(summary_path.name, "Minha Reunião - sumario.txt")
diff --git a/tests/test_music_detection.py b/tests/test_music_detection.py
@@ -0,0 +1,59 @@
+import unittest
+
+from lazier.downloader import is_music_content, is_music_domain
+
+
+class MusicDetectionTests(unittest.TestCase):
+    def test_vlog_with_background_music_tag_not_blocked(self):
+        info = {
+            'category': 'Education',
+            'tags': ['background music', 'vlog', 'tutorial'],
+        }
+        self.assertFalse(is_music_content(info))
+
+    def test_podcast_category_not_blocked(self):
+        info = {
+            'category': 'People & Blogs',
+            'tags': ['podcast', 'interview'],
+        }
+        self.assertFalse(is_music_content(info))
+
+    def test_music_category_blocked(self):
+        info = {'category': 'Music', 'tags': ['pop', '2024']}
+        self.assertTrue(is_music_content(info))
+
+    def test_music_category_lowercase_blocked(self):
+        info = {'categories': ['music']}
+        self.assertTrue(is_music_content(info))
+
+    def test_official_music_video_tag_blocked(self):
+        info = {
+            'category': 'Entertainment',
+            'tags': ['official music video', 'artist name'],
+        }
+        self.assertTrue(is_music_content(info))
+
+    def test_lyrics_video_tag_blocked(self):
+        info = {
+            'category': 'Entertainment',
+            'tags': ['lyrics video'],
+        }
+        self.assertTrue(is_music_content(info))
+
+    def test_genre_music_blocked(self):
+        info = {'category': 'Entertainment', 'genre': 'Música'}
+        self.assertTrue(is_music_content(info))
+
+    def test_empty_info_not_blocked(self):
+        self.assertFalse(is_music_content({}))
+        self.assertFalse(is_music_content(None))
+
+    def test_spotify_domain_blocked(self):
+        self.assertTrue(is_music_domain('https://open.spotify.com/track/abc'))
+
+    def test_youtube_domain_not_blocked_by_domain_check(self):
+        self.assertFalse(is_music_domain('https://www.youtube.com/watch?v=abc'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -58,8 +58,8 @@ class ProcessingTests(unittest.TestCase):
         self.assertEqual(result["content_type"], "podcast")
         self.assertIsNotNone(result["transcription_path"])
         self.assertIsNotNone(result["summary_path"])
-        self.assertRegex(Path(result["transcription_path"]).name, r"^lazier_\d{6}\.txt$")
-        self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$")
+        self.assertEqual(Path(result["transcription_path"]).name, "sample - transcricao.txt")
+        self.assertEqual(Path(result["summary_path"]).name, "sample - sumario.txt")
         self.assertEqual(result["result_path"], result["summary_path"])
         self.assertTrue(Path(result["transcription_path"]).exists())
         self.assertTrue(Path(result["summary_path"]).exists())
@@ -97,7 +97,8 @@ class ProcessingTests(unittest.TestCase):
         self.assertIsNone(result["smart_summary"])
         self.assertIsNotNone(result["transcription_path"])
         self.assertIsNotNone(result["summary_path"])
-        self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$")
+        self.assertEqual(Path(result["summary_path"]).name, "article - sumario.txt")
+        self.assertEqual(Path(result["transcription_path"]).name, "article - transcricao.txt")
         self.assertEqual(result["result_path"], result["summary_path"])
         self.assertTrue(Path(result["transcription_path"]).exists())
         self.assertTrue(Path(result["summary_path"]).exists())
@@ -148,7 +149,7 @@ class ProcessingTests(unittest.TestCase):
         self.assertIn("Resumo curto", result["summary"])
         self.assertIsNotNone(result["transcription_path"])
         self.assertIsNotNone(result["summary_path"])
-        self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.md$")
+        self.assertEqual(Path(result["summary_path"]).name, "smart - sumario.md")
         self.assertEqual(result["result_path"], result["summary_path"])

M README.md  | 4 +++-
M lazier/api/routes.py  | 29 +++++++++++++++++++++++------
M lazier/core/jobs.py  | 12 +++++++++++-
M lazier/core/processing.py  | 4 ++++
M lazier/downloader.py  | 58 +++++++++++++++++++++++++++++++++++++++-------------------
M lazier/utils.py  | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
A tests/test_export_filenames.py  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tests/test_jobs.py  | 3 ++-
A tests/test_music_detection.py  | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M tests/test_processing.py  | 9 +++++----

	lazier personal summarizer
	Log | Files | Refs | README