commit b1738d8d282ca60a93ad65b88b9f0959537ad129
parent 30c86b97c78ea073da753b26ce14c3aa9ddc89bc
Author: Pablo Murad <pblmrd@gmail.com>
Date: Thu, 21 May 2026 21:28:55 -0300
Small changes: title-based export filenames and narrower music detection
Diffstat:
10 files changed, 291 insertions(+), 33 deletions(-)
diff --git a/README.md b/README.md
@@ -174,6 +174,8 @@ Para áudio/vídeo, quando `OPENAI_ENABLE_CHAPTERS=true`, o Lazier:
Os capítulos aparecem nos exports DOCX/Markdown/PDF/TXT e no JSON cru.
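+Os arquivos exportados usam o **título do vídeo ou do conteúdo** (sanitizado) como nome e como cabeçalho do documento. Quando há transcrição e sumário em arquivos separados, os sufixos `- transcricao` e `- sumario` distinguem cada artefato. Sem título utilizável, o nome passa a ser `lazier_<6 dígitos>`; se já existir um arquivo com o mesmo nome na pasta de saída, acrescenta-se `-2`, `-3`, … ao nome.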
+
## Sites suportados (vídeo/áudio)
Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sites**. O Lazier usa o [yt-dlp](https://github.com/yt-dlp/yt-dlp) para extrair o áudio; se a URL não for um vídeo, o sistema tenta extrair o texto da página e sumarizar.
@@ -198,7 +200,7 @@ Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sit
E muitos outros. **Lista completa** mantida pelo yt-dlp: [Supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md).
-**Importante:** Conteúdo detectado como **música** (ex.: categoria Music no YouTube, domínios só de música) **não é processado** pelo Lazier.
+**Importante:** Conteúdo cujo **tema é música** (categoria Music no YouTube, tags de clipe oficial, domínios só de música) **não é processado** pelo Lazier. Vídeos com **música de fundo** (vlogs, palestras, tutoriais) continuam sendo processados normalmente.
## Docker
diff --git a/lazier/api/routes.py b/lazier/api/routes.py
@@ -17,11 +17,11 @@ from pydantic import BaseModel
from ..audio_processor import extract_audio_from_video
from ..core.formats import export
-from ..core.jobs import build_job_artifact_path, get_job_store, lazier_download_filename
+from ..core.jobs import build_job_artifact_path, get_job_store
from ..core.processing import process_source
from ..core.supported_sites import SUPPORTED_VIDEO_SITES
from ..core.playlist import is_playlist_url
-from ..utils import VIDEO_EXTENSIONS, is_upload_extension_allowed
+from ..utils import VIDEO_EXTENSIONS, build_export_filename, is_upload_extension_allowed
from .websocket import broadcast_progress
logger = logging.getLogger(__name__)
@@ -72,6 +72,15 @@ def _resolve_mode(
return "process"
+def _download_filename(job: dict, artifact_kind: str) -> str:
+    """Return the file name offered to the client for a job artifact.
+
+    The name is produced by ``build_export_filename`` from the job's stored
+    metadata, its export format (``"docx"`` when the job records none) and
+    its source name. No output directory is passed, so the name is not
+    checked against files that already exist on disk.
+
+    Args:
+        job: Job record; the keys ``metadata``, ``format`` and
+            ``source_name`` are read when present.
+        artifact_kind: Artifact kind forwarded to ``build_export_filename``.
+
+    Returns:
+        The file name, without any directory component.
+    """
+    metadata = job.get("metadata", {})
+    format_type = job.get("format", "docx")
+    source_name = job.get("source_name")
+    return build_export_filename(
+        metadata,
+        format_type,
+        source_name=source_name,
+        artifact_kind=artifact_kind,
+    )
+
+
def _job_title(job: dict) -> str:
metadata = job.get("metadata", {})
if metadata.get("title"):
@@ -236,6 +245,7 @@ def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]:
format_type=job.get("format", "docx"),
artifact_kind="transcription",
created_at=job.get("created_at"),
+ metadata=job.get("metadata", {}),
)
export(
transcription=job["transcription"],
@@ -254,6 +264,7 @@ def _ensure_download_file(job: dict, artifact_kind: str) -> Optional[str]:
format_type=job.get("format", "docx"),
artifact_kind="summary",
created_at=job.get("created_at"),
+ metadata=job.get("metadata", {}),
)
export(
transcription="",
@@ -453,7 +464,7 @@ async def download_transcription(job_id: str):
if not download_path:
raise HTTPException(status_code=404, detail="Transcricao nao disponivel")
- filename = Path(download_path).name
+ filename = _download_filename(job, "transcription")
return FileResponse(download_path, media_type="application/octet-stream", filename=filename)
@@ -469,7 +480,7 @@ async def download_summary(job_id: str):
if not download_path:
raise HTTPException(status_code=404, detail="Sumario nao disponivel")
- filename = Path(download_path).name
+ filename = _download_filename(job, "summary")
return FileResponse(download_path, media_type="application/octet-stream", filename=filename)
@@ -485,7 +496,8 @@ async def download_result(job_id: str):
if not download_path:
raise HTTPException(status_code=404, detail="Arquivo de resultado nao encontrado")
- filename = Path(download_path).name
+ artifact_kind = "transcription" if job.get("mode") == "transcribe" else "summary"
+ filename = _download_filename(job, artifact_kind)
return FileResponse(download_path, media_type="application/octet-stream", filename=filename)
@@ -510,7 +522,12 @@ async def download_bundle(job_id: str, background_tasks: BackgroundTasks):
tmp_path.unlink(missing_ok=True)
raise
- zip_filename = lazier_download_filename("zip")
+ zip_filename = build_export_filename(
+ job.get("metadata", {}),
+ "zip",
+ source_name=job.get("source_name"),
+ artifact_kind="result",
+ )
background_tasks.add_task(_unlink_quiet, str(tmp_path))
return FileResponse(str(tmp_path), media_type="application/zip", filename=zip_filename)
diff --git a/lazier/core/jobs.py b/lazier/core/jobs.py
@@ -13,6 +13,8 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
+from ..utils import build_export_filename
+
TERMINAL_STATUSES = {"completed", "failed", "interrupted"}
ARTIFACT_KINDS = frozenset({"transcription", "summary", "result"})
@@ -75,6 +77,7 @@ def build_job_artifact_path(
artifact_kind: str,
created_at: Optional[Union[str, datetime]] = None,
output_root: Optional[Path] = None,
+ metadata: Optional[Dict[str, Any]] = None,
) -> Path:
if artifact_kind not in ARTIFACT_KINDS:
raise ValueError(f"Artefato nao suportado: {artifact_kind}")
@@ -86,7 +89,14 @@ def build_job_artifact_path(
output_root=output_root,
)
output_dir.mkdir(parents=True, exist_ok=True)
- return output_dir / lazier_download_filename(format_type)
+ filename = build_export_filename(
+ metadata,
+ format_type,
+ source_name=source_name,
+ artifact_kind=artifact_kind,
+ output_dir=output_dir,
+ )
+ return output_dir / filename
class JobStore:
diff --git a/lazier/core/processing.py b/lazier/core/processing.py
@@ -729,6 +729,7 @@ def process_source(
artifact_kind="transcription",
created_at=created_at,
output_root=output_root,
+ metadata=metadata,
)
sm_path = build_job_artifact_path(
job_id=run_id,
@@ -737,6 +738,7 @@ def process_source(
artifact_kind="summary",
created_at=created_at,
output_root=output_root,
+ metadata=metadata,
)
export(
transcription=portuguese_text or "",
@@ -763,6 +765,7 @@ def process_source(
artifact_kind="summary",
created_at=created_at,
output_root=output_root,
+ metadata=metadata,
)
exported_path = export(
transcription="",
@@ -781,6 +784,7 @@ def process_source(
artifact_kind="transcription",
created_at=created_at,
output_root=output_root,
+ metadata=metadata,
)
exported_path = export(
transcription=portuguese_text or "",
diff --git a/lazier/downloader.py b/lazier/downloader.py
@@ -38,10 +38,13 @@ MUSIC_ONLY_DOMAINS = frozenset({
'jiosaavn.com', 'gaana.com', 'wynk.in', 'hungama.com', 'yandexmusic.ru',
})
-# Valores de category/genre/tags que indicam conteúdo de música
-MUSIC_CATEGORY_KEYWORDS = frozenset({
- 'music', 'música', 'musica', 'music video', 'music video clip',
- 'song', 'canção', 'cancao', 'album', 'single', 'mv', 'clip',
+# YouTube category/genre values (or equivalent) marking content whose subject is music
+MUSIC_CATEGORY_EXACT = frozenset({'music', 'música', 'musica'})
+
+# Tag phrases that indicate an official clip/audio (not background music in vlogs)
+MUSIC_TAG_PHRASES = frozenset({
+ 'music video', 'music video clip', 'official music video',
+ 'official audio', 'lyrics video', 'lyric video',
})
@@ -84,30 +87,47 @@ def is_music_domain(url: str) -> bool:
return False
+def _normalize_category_list(info: Dict[str, Any]) -> list[str]:
+    """Return the lower-cased category names found in yt-dlp metadata.
+
+    ``info['categories']`` is preferred; ``info['category']`` is consulted
+    only when the former yields nothing. Either key may hold a single string
+    or any iterable of values.
+
+    Args:
+        info: Metadata dictionary produced by ``extract_info``.
+
+    Returns:
+        Stripped, lower-cased category strings. Falsy and blank entries are
+        dropped, and a value that cannot be iterated yields no categories
+        instead of raising.
+    """
+
+    def _as_list(value: Any) -> list:
+        if not value:
+            return []
+        # A lone string is one category, not a sequence of characters.
+        if isinstance(value, str):
+            return [value]
+        try:
+            return list(value)
+        except TypeError:
+            # Non-iterable scalar: it cannot name a category.
+            return []
+
+    categories = _as_list(info.get('categories')) or _as_list(info.get('category'))
+    cleaned = (str(c).strip().lower() for c in categories if c)
+    return [c for c in cleaned if c]
+
+
def is_music_content(info: Dict[str, Any]) -> bool:
"""
- Verifica se os metadados do extract_info indicam conteúdo de música.
+ Check whether the extract_info metadata indicates content whose subject is music.
Usado após extract_info (yt-dlp) para recusar processamento de música.
+
+ Videos with a background soundtrack (tags such as "background music") are not blocked;
+ only a Music/Música category, an equivalent genre, or official-clip tags are.
"""
if not info:
return False
- category = (info.get('categories') or []) if isinstance(info.get('categories'), list) else []
- if not category and info.get('category'):
- category = [info['category']]
- genre = (info.get('genre') or '') if isinstance(info.get('genre'), str) else ''
+
+ # Categories must equal one of the music names exactly; substrings do not count.
+ for cat in _normalize_category_list(info):
+ if cat in MUSIC_CATEGORY_EXACT:
+ return True
+
+ # A string genre is compared against the same exact-name set; other types are ignored.
+ genre = info.get('genre') or ''
+ if isinstance(genre, str) and genre.strip().lower() in MUSIC_CATEGORY_EXACT:
+ return True
+
tags = info.get('tags') or []
if isinstance(tags, str):
tags = [tags]
- # YouTube: category id ou category string
- yt_category = (info.get('categories') or [info.get('category')] if info.get('category') else [])
- if isinstance(yt_category, str):
- yt_category = [yt_category]
- combined = ' '.join(
- str(x).lower() for x in
- (category + [genre] + list(tags)[:20] + list(yt_category))
- if x
- )
- return any(kw in combined for kw in MUSIC_CATEGORY_KEYWORDS)
+ # Only the first 20 tags are inspected; a tag matches when it contains any listed phrase.
+ for tag in list(tags)[:20]:
+ tag_lower = str(tag).lower()
+ if any(phrase in tag_lower for phrase in MUSIC_TAG_PHRASES):
+ return True
+
+ return False
def _classify_youtube_error(error: Exception) -> Tuple[Type[YouTubeDownloadError], bool]:
diff --git a/lazier/utils.py b/lazier/utils.py
@@ -4,12 +4,13 @@ Utilitários para validação, limpeza e verificação de dependências
import os
import re
+import secrets
import shutil
import threading
import time
import unicodedata
from pathlib import Path
-from typing import Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
from urllib.parse import urlparse
@@ -156,6 +157,91 @@ def sanitize_filename(filename: str) -> str:
return filename.strip()
+# Title values (compared after strip + lower) that count as "no title".
+_EMPTY_TITLE_VALUES = frozenset({"", "sem título", "sem titulo"})
+
+# Text appended to the base file name, keyed by artifact kind.
+_ARTIFACT_FILENAME_SUFFIX = {
+ "transcription": " - transcricao",
+ "summary": " - sumario",
+ "result": "",
+}
+
+
+def _title_usable_for_filename(
+    metadata: Optional[Dict[str, Any]] = None,
+    source_name: Optional[str] = None,
+) -> bool:
+    """Tell whether a real title is available to name the exported file.
+
+    The decision is delegated to ``resolve_export_title`` so that the two
+    functions cannot drift apart: a title is usable exactly when that
+    function finds something other than its fallback.
+
+    Args:
+        metadata: Content metadata; only its ``title`` entry matters.
+        source_name: Original source name; http(s) URLs are not used.
+
+    Returns:
+        ``True`` when the metadata title or the source-name stem can be used.
+    """
+    # Every non-fallback result of resolve_export_title is a non-empty
+    # string, so an empty fallback makes "nothing usable" the only falsy case.
+    return bool(resolve_export_title(metadata, source_name, fallback=""))
+
+
+def resolve_export_title(
+    metadata: Optional[Dict[str, Any]] = None,
+    source_name: Optional[str] = None,
+    fallback: str = "Transcricao",
+) -> str:
+    """Pick the title used for the document and as the export file-name base.
+
+    Preference order:
+      1. the metadata ``title``, stripped, unless it is blank or one of the
+         "no title" placeholders;
+      2. the stem of ``source_name`` when that is not an http(s) URL;
+      3. ``fallback``.
+    """
+    raw_title = (metadata or {}).get("title")
+    if raw_title:
+        cleaned = str(raw_title).strip()
+        if cleaned.lower() not in _EMPTY_TITLE_VALUES:
+            return cleaned
+
+    if not source_name or str(source_name).startswith(("http://", "https://")):
+        return fallback
+
+    # An empty stem falls through to the fallback.
+    return Path(source_name).stem or fallback
+
+
+def _lazier_fallback_filename(format_type: str, artifact_suffix: str = "") -> str:
+    """Build a ``lazier_<6 digits>`` file name for content without a title.
+
+    Args:
+        format_type: Export format; leading dots are ignored.
+        artifact_suffix: Text placed between the number and the extension.
+
+    Returns:
+        ``lazier_<n><artifact_suffix>.<ext>`` with ``n`` drawn at random
+        from 100000 to 999999 inclusive.
+    """
+    extension = format_type.lstrip(".")
+    six_digits = 100_000 + secrets.randbelow(900_000)
+    return f"lazier_{six_digits}{artifact_suffix}.{extension}"
+
+
+def build_export_filename(
+    metadata: Optional[Dict[str, Any]] = None,
+    format_type: str = "docx",
+    *,
+    source_name: Optional[str] = None,
+    artifact_kind: Optional[str] = None,
+    output_dir: Optional[Path] = None,
+) -> str:
+    """Build the export file name from the video/content title.
+
+    Args:
+        metadata: Content metadata; its ``title`` is the preferred base name.
+        format_type: Export format used as the extension (leading dots ignored).
+        source_name: Original source name, used when metadata has no title.
+        artifact_kind: ``"transcription"``, ``"summary"`` or ``"result"``;
+            selects the suffix appended to the base name. ``None`` and
+            unknown kinds add no suffix.
+        output_dir: When given, the returned name does not exist in this
+            directory at call time: ``-2``, ``-3``, ... is appended until a
+            free name is found.
+
+    Returns:
+        The file name (no directory). Falls back to ``lazier_<6 digits>``
+        when there is no usable title or the title sanitises to nothing.
+    """
+    ext = format_type.lstrip(".")
+    artifact_suffix = _ARTIFACT_FILENAME_SUFFIX.get(artifact_kind or "result", "")
+
+    base = ""
+    if _title_usable_for_filename(metadata, source_name):
+        base = sanitize_filename(resolve_export_title(metadata, source_name))
+
+    if base:
+        name = f"{base}{artifact_suffix}.{ext}"
+    else:
+        # Also covers a title that sanitises to an empty string, which would
+        # otherwise produce names such as ".docx" or " - sumario.docx".
+        name = _lazier_fallback_filename(format_type, artifact_suffix)
+
+    if output_dir is None:
+        return name
+
+    out = Path(output_dir)
+    path = out / name
+    if not path.exists():
+        return name
+
+    # NOTE: only existence is checked here; nothing is created, so the caller
+    # still has to write the file for the name to become taken.
+    stem = path.stem
+    suffix = path.suffix
+    counter = 2
+    while (out / f"{stem}-{counter}{suffix}").exists():
+        counter += 1
+    return f"{stem}-{counter}{suffix}"
+
+
def sanitize_xml_string(text: str) -> str:
"""
Remove caracteres inválidos para XML, mantendo apenas caracteres válidos
diff --git a/tests/test_export_filenames.py b/tests/test_export_filenames.py
@@ -0,0 +1,58 @@
+import os
+import unittest
+from pathlib import Path
+
+from lazier.utils import (
+ build_export_filename,
+ resolve_export_title,
+ sanitize_filename,
+)
+
+
+# Tests for the title-based export file names built by lazier.utils.
+class ExportFilenameTests(unittest.TestCase):
+ # Scratch directory under the current working directory, used by the collision test.
+ def setUp(self):
+ self.temp_dir = Path(os.getcwd()) / ".tmp-tests" / "export-filenames"
+ self.temp_dir.mkdir(parents=True, exist_ok=True)
+
+ # Removes the entries left in the scratch directory; the directory itself is kept.
+ def tearDown(self):
+ for f in self.temp_dir.glob("*"):
+ f.unlink(missing_ok=True)
+
+ # The metadata title wins even when the source name is a URL.
+ def test_resolve_title_from_metadata(self):
+ title = resolve_export_title({"title": "Meu Video Legal"}, source_name="https://youtube.com/watch?v=x")
+ self.assertEqual(title, "Meu Video Legal")
+
+ # Without a metadata title, the stem of a local source name is used.
+ def test_resolve_title_from_source_name_when_no_metadata_title(self):
+ title = resolve_export_title({}, source_name="palestra.mp3")
+ self.assertEqual(title, "palestra")
+
+ # No artifact kind given: the name is just the sanitized title plus extension.
+ def test_video_title_becomes_docx_filename(self):
+ name = build_export_filename(
+ {"title": "Meu Video"},
+ "docx",
+ source_name="https://youtube.com/watch?v=abc",
+ )
+ self.assertEqual(name, f"{sanitize_filename('Meu Video')}.docx")
+
+ # Transcription and summary of the same title must get distinct names.
+ def test_dual_artifacts_use_transcricao_and_sumario_suffixes(self):
+ meta = {"title": "Palestra IA"}
+ tx = build_export_filename(meta, "docx", artifact_kind="transcription")
+ sm = build_export_filename(meta, "docx", artifact_kind="summary")
+ self.assertTrue(tx.endswith(" - transcricao.docx"))
+ self.assertTrue(sm.endswith(" - sumario.docx"))
+ self.assertNotEqual(tx, sm)
+
+ # The first name is written to disk so the second call sees it as taken.
+ def test_collision_appends_numeric_suffix(self):
+ meta = {"title": "Duplicado"}
+ first = build_export_filename(meta, "txt", output_dir=self.temp_dir)
+ (self.temp_dir / first).write_text("x", encoding="utf-8")
+ second = build_export_filename(meta, "txt", output_dir=self.temp_dir)
+ self.assertEqual(second, f"{sanitize_filename('Duplicado')}-2.txt")
+
+ # Neither a title nor a local source name: expect the lazier_<6 digits> pattern.
+ def test_no_title_falls_back_to_lazier_pattern(self):
+ name = build_export_filename({}, "docx", source_name="https://example.com/v")
+ self.assertRegex(name, r"^lazier_\d{6}\.docx$")
+
+
+# Allow running this test module directly.
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/test_jobs.py b/tests/test_jobs.py
@@ -55,10 +55,11 @@ class JobStoreTests(unittest.TestCase):
artifact_kind="summary",
created_at=created_at,
output_root=self.temp_dir,
+ metadata={"title": "Minha Reunião"},
)
self.assertIn("2026", str(output_dir))
self.assertIn("03", str(output_dir))
self.assertIn("31", str(output_dir))
self.assertTrue(output_dir.name.startswith("minha-reuniao-abc12345"))
- self.assertRegex(summary_path.name, r"^lazier_\d{6}\.txt$")
+ self.assertEqual(summary_path.name, "Minha Reunião - sumario.txt")
diff --git a/tests/test_music_detection.py b/tests/test_music_detection.py
@@ -0,0 +1,59 @@
+import unittest
+
+from lazier.downloader import is_music_content, is_music_domain
+
+
+# Tests for music detection: metadata-based (is_music_content) and URL-based (is_music_domain).
+class MusicDetectionTests(unittest.TestCase):
+ # A "background music" tag alone must not block a non-music category.
+ def test_vlog_with_background_music_tag_not_blocked(self):
+ info = {
+ 'category': 'Education',
+ 'tags': ['background music', 'vlog', 'tutorial'],
+ }
+ self.assertFalse(is_music_content(info))
+
+ def test_podcast_category_not_blocked(self):
+ info = {
+ 'category': 'People & Blogs',
+ 'tags': ['podcast', 'interview'],
+ }
+ self.assertFalse(is_music_content(info))
+
+ # Category matching is case-insensitive: 'Music' is blocked.
+ def test_music_category_blocked(self):
+ info = {'category': 'Music', 'tags': ['pop', '2024']}
+ self.assertTrue(is_music_content(info))
+
+ # The plural 'categories' key is honoured as well.
+ def test_music_category_lowercase_blocked(self):
+ info = {'categories': ['music']}
+ self.assertTrue(is_music_content(info))
+
+ # Official-clip tags block the content even under a non-music category.
+ def test_official_music_video_tag_blocked(self):
+ info = {
+ 'category': 'Entertainment',
+ 'tags': ['official music video', 'artist name'],
+ }
+ self.assertTrue(is_music_content(info))
+
+ def test_lyrics_video_tag_blocked(self):
+ info = {
+ 'category': 'Entertainment',
+ 'tags': ['lyrics video'],
+ }
+ self.assertTrue(is_music_content(info))
+
+ # The genre field is matched against the same names, accents included.
+ def test_genre_music_blocked(self):
+ info = {'category': 'Entertainment', 'genre': 'Música'}
+ self.assertTrue(is_music_content(info))
+
+ # Both an empty dict and None must be treated as "not music".
+ def test_empty_info_not_blocked(self):
+ self.assertFalse(is_music_content({}))
+ self.assertFalse(is_music_content(None))
+
+ def test_spotify_domain_blocked(self):
+ self.assertTrue(is_music_domain('https://open.spotify.com/track/abc'))
+
+ # YouTube is a general-purpose site; the domain check alone must let it through.
+ def test_youtube_domain_not_blocked_by_domain_check(self):
+ self.assertFalse(is_music_domain('https://www.youtube.com/watch?v=abc'))
+
+
+# Allow running this test module directly.
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/test_processing.py b/tests/test_processing.py
@@ -58,8 +58,8 @@ class ProcessingTests(unittest.TestCase):
self.assertEqual(result["content_type"], "podcast")
self.assertIsNotNone(result["transcription_path"])
self.assertIsNotNone(result["summary_path"])
- self.assertRegex(Path(result["transcription_path"]).name, r"^lazier_\d{6}\.txt$")
- self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$")
+ self.assertEqual(Path(result["transcription_path"]).name, "sample - transcricao.txt")
+ self.assertEqual(Path(result["summary_path"]).name, "sample - sumario.txt")
self.assertEqual(result["result_path"], result["summary_path"])
self.assertTrue(Path(result["transcription_path"]).exists())
self.assertTrue(Path(result["summary_path"]).exists())
@@ -97,7 +97,8 @@ class ProcessingTests(unittest.TestCase):
self.assertIsNone(result["smart_summary"])
self.assertIsNotNone(result["transcription_path"])
self.assertIsNotNone(result["summary_path"])
- self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.txt$")
+ self.assertEqual(Path(result["summary_path"]).name, "article - sumario.txt")
+ self.assertEqual(Path(result["transcription_path"]).name, "article - transcricao.txt")
self.assertEqual(result["result_path"], result["summary_path"])
self.assertTrue(Path(result["transcription_path"]).exists())
self.assertTrue(Path(result["summary_path"]).exists())
@@ -148,7 +149,7 @@ class ProcessingTests(unittest.TestCase):
self.assertIn("Resumo curto", result["summary"])
self.assertIsNotNone(result["transcription_path"])
self.assertIsNotNone(result["summary_path"])
- self.assertRegex(Path(result["summary_path"]).name, r"^lazier_\d{6}\.md$")
+ self.assertEqual(Path(result["summary_path"]).name, "smart - sumario.md")
self.assertEqual(result["result_path"], result["summary_path"])