changes - lazier - personal summarizer

commit 2fc19a484e50d16e7342e3105337a409663a9269
parent 3d8a93b1024e657723eff0ef1e9aeeb1dbbbe5e6
Author: Pablo Murad <pablo@pablomurad.com>
Date:   Sat, 31 Jan 2026 15:12:26 -0300

changes

Diffstat:
A .dockerignore  | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M README.md  | 26 +++++++++++++++++++++++++-
M lazier/api/routes.py  | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M lazier/cli.py  | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
M lazier/core/exceptions.py  | 5 +++++
M lazier/core/formats.py  | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M lazier/core/playlist.py  | 2 +-
A lazier/core/supported_sites.py  | 199 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M lazier/downloader.py  | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M lazier/web/templates/index.html  | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M pyproject.toml  | 1 +
M requirements.txt  | 1 +

12 files changed, 829 insertions(+), 17 deletions(-)
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,61 @@
+# Não incluir .env na imagem (usado via env_file no compose)
+.env
+.env.*
+!.env.example
+
+# Pasta Docker (não precisa dentro da imagem)
+docker/
+
+# Git e IDE
+.git/
+.gitignore
+.cursor/
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Ambientes virtuais
+venv/
+.venv/
+ENV/
+env/
+
+# Outputs e dados locais (montados por volume no compose)
+outputs/
+downloads/
+*.tmp
+*.temp
+*_audio.*
+*.docx
+
+# OS e logs
+.DS_Store
+Thumbs.db
+*.log
+
+# Testes (opcional; descomente se quiser excluir)
+# tests/
diff --git a/README.md b/README.md
@@ -43,7 +43,7 @@ lazier document.pdf
 lazier "https://www.youtube.com/watch?v=VIDEO_ID"
 
 # Opções
-lazier audio.mp3 --format json          # Formato: docx, txt, md, json
+lazier audio.mp3 --format json          # Formato: docx, txt, md, json, pdf
 lazier transcribe video.mp4             # Apenas transcrição
 lazier web                               # Inicia servidor web
 lazier cache clear                       # Limpa cache
@@ -53,7 +53,31 @@ lazier cache clear                       # Limpa cache
 
 Acesse http://localhost:19283 após iniciar com `lazier web` ou Docker.
 
+## Sites suportados (vídeo/áudio)
 
+Além do YouTube, você pode colar URLs de vídeo ou áudio de **centenas de sites**. O Lazier usa o [yt-dlp](https://github.com/yt-dlp/yt-dlp) para extrair o áudio; se a URL não for um vídeo, o sistema tenta extrair o texto da página e sumarizar.
+
+**Exemplos de sites que você pode processar:**
+
+| | | |
+|---|---|---|
+| YouTube | TED | Reddit |
+| Vimeo | Twitter / X | TikTok |
+| Instagram | Facebook | Twitch |
+| Dailymotion | BBC, CNN, NBC | NPR, PBS |
+| Arte, France TV, RTVE | Khan Academy | Coursera, Udemy |
+| LinkedIn Learning | Loom | Streamable |
+| BitChute, Odysee | Rumble | PeerTube |
+| archive.org | Patreon | Substack |
+| Wistia | Niconico | Bilibili |
+| Kick, Floatplane | Nebula | CuriosityStream |
+| C-SPAN | Al Jazeera | DW, Reuters |
+| ESPN, Fox Sports | Formula 1 | Olympics |
+| NYTimes | Washington Post | The Guardian |
+
+E muitos outros. **Lista completa** mantida pelo yt-dlp: [Supported sites](https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md).
+
+**Importante:** Conteúdo detectado como **música** (ex.: categoria Music no YouTube, domínios só de música) **não é processado** pelo Lazier.
 
 ## Docker
 
diff --git a/lazier/api/routes.py b/lazier/api/routes.py
@@ -19,10 +19,12 @@ from ..core.formats import export
 from ..core.exceptions import (
     YouTubeDownloadError,
     YouTubeVideoUnavailableError,
-    YouTubeAccessDeniedError
+    YouTubeAccessDeniedError,
+    MusicContentError,
 )
+from ..core.supported_sites import SUPPORTED_VIDEO_SITES
 from ..utils import validate_input, get_output_filename, is_youtube_url, get_lazier_filename
-from ..downloader import download_youtube_audio
+from ..downloader import download_youtube_audio, download_video_audio
 from ..audio_processor import prepare_audio_file, extract_audio_from_video
 from ..transcriber import transcribe_audio
 from ..summarizer import summarize_text, summarize_text_file, summarize_web_page, summarize_pdf
@@ -364,11 +366,10 @@ async def process_url(request: ProcessRequest, background_tasks: BackgroundTasks
     
     # Processa em background
     if is_youtube_url(request.url):
-        # Processa YouTube
         background_tasks.add_task(process_youtube_async, request.url, job_id, request.format, request.transcribe, request.summarize)
     else:
-        # Processa página web
-        background_tasks.add_task(process_web_async, request.url, job_id, request.format, request.transcribe, request.summarize)
+        # Tenta processar como vídeo de qualquer site; se falhar, fallback para página web
+        background_tasks.add_task(process_video_url_async, request.url, job_id, request.format, request.transcribe, request.summarize)
     
     return {"job_id": job_id, "status": "processing"}
 
@@ -419,6 +420,13 @@ def process_youtube_async(url: str, job_id: str, output_format: str, should_tran
                 broadcast_progress(job_id, 20, 'processing', 'Baixando vídeo do YouTube...')
                 try:
                     audio_file, metadata = download_youtube_audio(url, str(UPLOAD_DIR))
+                except MusicContentError as e:
+                    logger.error(f"Conteúdo detectado como música (job {job_id}): {str(e)}")
+                    user_message = "Conteúdo detectado como música não é processado pelo Lazier."
+                    jobs[job_id]['status'] = 'failed'
+                    jobs[job_id]['error'] = user_message
+                    broadcast_progress(job_id, 0, 'failed', user_message)
+                    return
                 except YouTubeVideoUnavailableError as e:
                     logger.error(f"Vídeo não disponível (job {job_id}): {str(e)}")
                     user_message = (
@@ -581,6 +589,114 @@ def process_youtube_async(url: str, job_id: str, output_format: str, should_tran
             broadcast_progress(job_id, 0, 'failed', f'Erro: {error_msg}')
 
 
+def process_video_url_async(url: str, job_id: str, output_format: str, should_transcribe: bool, should_summarize: bool):
+    """Try to process ``url`` as a video/audio source; fall back to web-page processing.
+
+    The audio is fetched with ``download_video_audio`` and handed to
+    ``_run_video_pipeline``. Content flagged as music fails the job with a
+    user-facing message. Any other download error delegates the URL to
+    ``process_web_async``. Errors raised later in the pipeline are logged with
+    their traceback and recorded on the job.
+
+    Args:
+        url: URL submitted by the user.
+        job_id: Key of the job entry in ``jobs``.
+        output_format: Export format forwarded to the pipeline.
+        should_transcribe: Whether the transcription was requested.
+        should_summarize: Whether the summary was requested.
+    """
+    try:
+        broadcast_progress(job_id, 10, 'processing', 'Tentando extrair vídeo/áudio da URL...')
+        try:
+            audio_file, metadata = download_video_audio(url, str(UPLOAD_DIR))
+        except MusicContentError as e:
+            logger.error(f"Conteúdo detectado como música (job {job_id}): {str(e)}")
+            user_message = "Conteúdo detectado como música não é processado pelo Lazier."
+            jobs[job_id]['status'] = 'failed'
+            jobs[job_id]['error'] = user_message
+            broadcast_progress(job_id, 0, 'failed', user_message)
+            return
+        except Exception as e:
+            # Not every URL is a video: a download failure is expected here and
+            # simply means the page should be treated as regular web content.
+            logger.info(f"URL não é vídeo ou falha ao baixar (job {job_id}), fallback para página web: {e}")
+            process_web_async(url, job_id, output_format, should_transcribe, should_summarize)
+            return
+        _run_video_pipeline(
+            job_id=job_id,
+            audio_file=audio_file,
+            metadata=metadata,
+            output_format=output_format,
+            should_transcribe=should_transcribe,
+            should_summarize=should_summarize,
+            cache_prefix='video',
+            url_hash=calculate_url_hash(url),
+        )
+    except Exception as e:
+        # Keep the traceback in the logs; the job entry only stores the message.
+        logger.exception(f"Falha ao processar URL de vídeo (job {job_id})")
+        if jobs[job_id].get('status') != 'failed':
+            jobs[job_id]['status'] = 'failed'
+            jobs[job_id]['error'] = str(e)
+            broadcast_progress(job_id, 0, 'failed', str(e))
+
+
+def _run_video_pipeline(
+    job_id: str,
+    audio_file: str,
+    metadata: dict,
+    output_format: str,
+    should_transcribe: bool,
+    should_summarize: bool,
+    cache_prefix: str,
+    url_hash: str,
+):
+    """Shared pipeline: transcribe, summarize, export and cache.
+
+    Cached data is used when available and only the missing pieces are
+    computed, so a cache entry that holds a transcription but no summary still
+    yields a summary when one is requested. Exports follow the same rules
+    whether the data came from the cache or was freshly produced.
+
+    Args:
+        job_id: Key of the job entry in ``jobs``; results are written there.
+        audio_file: Path of the downloaded audio passed to ``transcribe_audio``.
+        metadata: Source metadata; replaced by the cached metadata on a cache hit.
+        output_format: Format forwarded to ``export`` and ``get_lazier_filename``.
+        should_transcribe: Whether the transcription is part of the output.
+        should_summarize: Whether the summary is part of the output.
+        cache_prefix: Namespace used for the cache lookup.
+        url_hash: Cache key within ``cache_prefix``.
+    """
+    if 'transcription' not in jobs[job_id]:
+        jobs[job_id]['transcription'] = None
+        jobs[job_id]['summary'] = None
+        jobs[job_id]['transcription_path'] = None
+        jobs[job_id]['summary_path'] = None
+        jobs[job_id]['metadata'] = {}
+
+    cache = get_cache_manager()
+    cached = cache.get(cache_prefix, url_hash) if cache else None
+
+    transcription_internal = None
+    summary = None
+    transcription_path = None
+    summary_path = None
+    produced_new_data = False
+
+    if cached:
+        transcription_internal = cached.get('transcription')
+        summary = cached.get('summary')
+        metadata = cached.get('metadata', metadata)
+        if transcription_internal:
+            # Reported as "processing": the output file has not been written yet.
+            broadcast_progress(job_id, 60, 'processing', 'Dados encontrados no cache')
+
+    # The summary is built from the transcription, so either request needs it.
+    if (should_transcribe or should_summarize) and not transcription_internal:
+        broadcast_progress(job_id, 30, 'processing', 'Transcrevendo áudio...')
+        transcription_internal = transcribe_audio(audio_file, language='pt', model='whisper-1')
+        broadcast_progress(job_id, 60, 'processing', 'Transcrição concluída')
+        produced_new_data = bool(transcription_internal)
+
+    if should_summarize and transcription_internal and not summary:
+        broadcast_progress(job_id, 70, 'processing', 'Gerando sumário...')
+        summary = summarize_text(transcription_internal, model='gpt-4o-mini', language='pt-BR')
+        broadcast_progress(job_id, 80, 'processing', 'Sumário concluído')
+        produced_new_data = produced_new_data or bool(summary)
+
+    # Only rewrite the cache entry when this run added something to it.
+    if cache and transcription_internal and produced_new_data:
+        cache.set(cache_prefix, url_hash, {
+            'transcription': transcription_internal,
+            'summary': summary,
+            'metadata': metadata,
+            'timestamp': datetime.now().isoformat(),
+        })
+
+    transcription = transcription_internal if should_transcribe else None
+
+    if should_transcribe and not should_summarize and transcription:
+        transcription_path = Path(get_lazier_filename(OUTPUT_DIR, output_format, "_transcription"))
+        export(transcription=transcription, summary=None, metadata=metadata, output_path=str(transcription_path), format_type=output_format)
+    elif should_summarize and not should_transcribe and summary:
+        summary_path = Path(get_lazier_filename(OUTPUT_DIR, output_format, "_summary"))
+        export(transcription="", summary=summary, metadata=metadata, output_path=str(summary_path), format_type=output_format)
+
+    # Consolidated file: both artifacts requested, or a transcription-only
+    # request for which no dedicated transcription file was written.
+    should_generate_consolidated = should_transcribe and (should_summarize or not transcription_path)
+    if should_generate_consolidated:
+        broadcast_progress(job_id, 90, 'processing', 'Gerando arquivo de saída...')
+        output_path = Path(get_lazier_filename(OUTPUT_DIR, output_format))
+        export(transcription=transcription or "", summary=summary if should_summarize else None, metadata=metadata, output_path=str(output_path), format_type=output_format)
+        jobs[job_id]['result_path'] = str(output_path)
+
+    jobs[job_id]['transcription'] = transcription
+    jobs[job_id]['summary'] = summary
+    jobs[job_id]['transcription_path'] = str(transcription_path) if transcription_path else None
+    jobs[job_id]['summary_path'] = str(summary_path) if summary_path else None
+    jobs[job_id]['metadata'] = metadata
+    jobs[job_id]['status'] = 'completed'
+    jobs[job_id]['progress'] = 100
+    broadcast_progress(job_id, 100, 'completed', 'Processamento concluído')
+
+
 def process_web_async(url: str, job_id: str, output_format: str, should_transcribe: bool, should_summarize: bool):
     """Processa página web"""
     try:
@@ -1007,3 +1123,9 @@ async def clear_cache():
         count = cache.clear_all()
         return {"message": f"Cache limpo: {count} chaves removidas"}
     return {"message": "Cache não disponível"}
+
+
+@router.get("/supported-sites")
+async def get_supported_sites():
+    """Return the curated list of sites that can be processed (video/audio via yt-dlp)."""
+    payload = {"sites": SUPPORTED_VIDEO_SITES}
+    return payload
diff --git a/lazier/cli.py b/lazier/cli.py
@@ -19,13 +19,15 @@ from .utils import (
     get_output_filename,
     check_ffmpeg
 )
-from .downloader import download_youtube_audio
+from .downloader import download_youtube_audio, download_video_audio
 from .audio_processor import prepare_audio_file
 from .transcriber import transcribe_audio
 from .summarizer import summarize_text, summarize_text_file, summarize_web_page, summarize_pdf
 from .core.formats import export
 from .core.playlist import is_playlist_url, process_playlist
 from .core.cache import get_cache_manager, calculate_file_hash, calculate_url_hash
+from .core.exceptions import MusicContentError
+from .web.extractor import extract_pdf_content, extract_text_file_content
 
 load_dotenv()
 
@@ -37,7 +39,7 @@ console = Console()
 @click.version_option(version='0.01', prog_name='lazier')
 @click.argument('input_path', required=False)
 @click.option('--output', '-o', type=str, help='Nome do arquivo de saída')
-@click.option('--format', '-f', type=click.Choice(['docx', 'txt', 'md', 'json']), default='docx', help='Formato de saída (padrão: docx)')
+@click.option('--format', '-f', type=click.Choice(['docx', 'txt', 'md', 'json', 'pdf']), default='docx', help='Formato de saída (padrão: docx)')
 @click.option('--language', '-l', default='pt', help='Idioma para transcrição (padrão: pt)')
 @click.option('--model', default='whisper-1', help='Modelo Whisper (padrão: whisper-1)')
 @click.option('--gpt-model', default='gpt-4o-mini', help='Modelo GPT para sumarização (padrão: gpt-4o-mini)')
@@ -69,7 +71,7 @@ def cli(ctx, input_path, output, format, language, model, gpt_model, keep_files,
 @cli.command()
 @click.argument('input_path', type=str)
 @click.option('--output', '-o', type=str, help='Nome do arquivo de saída')
-@click.option('--format', '-f', type=click.Choice(['docx', 'txt', 'md', 'json']), default='docx', help='Formato de saída')
+@click.option('--format', '-f', type=click.Choice(['docx', 'txt', 'md', 'json', 'pdf']), default='docx', help='Formato de saída')
 @click.option('--language', '-l', default='pt', help='Idioma para transcrição (padrão: pt)')
 @click.option('--model', default='whisper-1', help='Modelo Whisper (padrão: whisper-1)')
 @click.option('--keep-files', is_flag=True, help='Não deletar arquivos temporários')
@@ -154,6 +156,7 @@ def process_input(
         metadata = {}
         transcription = None
         summary = None
+        web_video_url_hash = None
         
         try:
             # Verifica se é playlist
@@ -206,7 +209,35 @@ def process_input(
                     files_to_cleanup.append(audio_file)
                 progress.update(task3, completed=100)
             
-            # Processa texto/PDF/web
+            elif input_type == 'web':
+                progress.update(task3, description="[cyan]Tentando extrair vídeo/áudio da URL...")
+                web_video_url_hash = calculate_url_hash(input_path) if cache else None
+                try:
+                    audio_file, metadata = download_video_audio(input_path)
+                    files_to_cleanup.append(audio_file)
+                    if cache and web_video_url_hash:
+                        cached = cache.get('video', web_video_url_hash)
+                        if cached:
+                            transcription = cached.get('transcription')
+                            summary = cached.get('summary') if should_summarize else None
+                            metadata = cached.get('metadata', {})
+                            console.print("[green]✓[/green] Usando cache")
+                            audio_file = None
+                    progress.update(task3, completed=100)
+                except MusicContentError:
+                    console.print("[red]Conteúdo detectado como música não é processado pelo Lazier.[/red]")
+                    cleanup_files(files_to_cleanup)
+                    sys.exit(1)
+                except Exception:
+                    progress.update(task3, description="[cyan]Extraindo texto da página web...")
+                    content_data = extract_web_content(input_path)
+                    metadata = {'title': content_data.get('title', 'Página Web'), 'file_path': input_path}
+                    transcription = content_data['content']
+                    summary = summarize_web_page(input_path, model=gpt_model, language='pt-BR') if should_summarize else None
+                    progress.update(task3, completed=100)
+                    audio_file = None
+            
+            # Processa texto/PDF/web (arquivos locais)
             elif Path(input_path).suffix.lower() == '.pdf':
                 progress.update(task3, description="[cyan]Extraindo texto do PDF...")
                 content_data = extract_pdf_content(input_path)
@@ -285,6 +316,14 @@ def process_input(
                     transcription = content_data['content']
                     progress.update(task5, completed=100)
             
+            if web_video_url_hash and cache and transcription:
+                cache.set('video', web_video_url_hash, {
+                    'transcription': transcription,
+                    'summary': summary,
+                    'metadata': metadata,
+                    'timestamp': datetime.now().isoformat(),
+                })
+            
             # Gera arquivo de saída
             task6 = progress.add_task(f"[blue]Gerando arquivo {format_type.upper()}...", total=100)
             
@@ -312,6 +351,10 @@ def process_input(
             if not keep_files:
                 cleanup_files(files_to_cleanup)
         
+        except MusicContentError:
+            console.print("[red]Conteúdo detectado como música não é processado pelo Lazier.[/red]")
+            cleanup_files(files_to_cleanup)
+            sys.exit(1)
         except KeyboardInterrupt:
             console.print("\n[yellow]Operação cancelada pelo usuário.[/yellow]")
             cleanup_files(files_to_cleanup)
diff --git a/lazier/core/exceptions.py b/lazier/core/exceptions.py
@@ -32,3 +32,8 @@ class YouTubeVideoUnavailableError(YouTubeDownloadError):
 class YouTubeAccessDeniedError(YouTubeDownloadError):
     """Acesso negado ao vídeo (403, bloqueio)"""
     pass
+
+
+class MusicContentError(LazierException):
+    """Raised when content is detected as music, which Lazier must not process."""
diff --git a/lazier/core/formats.py b/lazier/core/formats.py
@@ -1,5 +1,5 @@
 """
-Exportadores de múltiplos formatos (TXT, Markdown, JSON, DOCX)
+Exportadores de múltiplos formatos (TXT, Markdown, JSON, DOCX, PDF)
 """
 
 import json
@@ -11,6 +11,7 @@ from docx.shared import Pt, Inches, RGBColor
 from docx.enum.text import WD_ALIGN_PARAGRAPH
 
 from ..docx_generator import _format_duration
+from ..utils import sanitize_xml_string
 
 
 def export_txt(
@@ -249,6 +250,94 @@ def export_docx(
     return create_document(transcription, summary, metadata, output_path)
 
 
+def export_pdf(
+    transcription: str,
+    summary: Optional[str] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    output_path: str = "output.pdf"
+) -> str:
+    """
+    Export the transcription and the optional summary as a PDF file.
+
+    Every piece of text is escaped before it reaches reportlab, because
+    ``Paragraph`` interprets its input as XML-like markup: a raw ``&``, ``<``
+    or ``>`` in a title or URL would otherwise be parsed as markup.
+
+    Args:
+        transcription: Transcribed text.
+        summary: Summarized text (optional).
+        metadata: Video/audio metadata (``title``, ``duration``, ``uploader``,
+            ``webpage_url`` are used when present).
+        output_path: Path of the output file; parent folders are created.
+
+    Returns:
+        Path of the created file.
+    """
+    from reportlab.lib.pagesizes import A4
+    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+    from reportlab.lib.units import cm
+    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+
+    output_path_obj = Path(output_path)
+    output_path_obj.parent.mkdir(parents=True, exist_ok=True)
+
+    doc = SimpleDocTemplate(
+        str(output_path_obj),
+        pagesize=A4,
+        rightMargin=2*cm,
+        leftMargin=2*cm,
+        topMargin=2*cm,
+        bottomMargin=2*cm,
+    )
+    styles = getSampleStyleSheet()
+    title_style = ParagraphStyle(
+        'CustomTitle',
+        parent=styles['Heading1'],
+        fontSize=16,
+        spaceAfter=12,
+        alignment=1,
+    )
+    heading_style = styles['Heading2']
+    body_style = styles['Normal']
+
+    metadata = metadata or {}
+    story = []
+
+    # ``or`` also covers a title key that is present but None/empty.
+    title = str(metadata.get('title') or 'Transcrição')
+    story.append(Paragraph(_pdf_markup(title), title_style))
+    story.append(Spacer(1, 12))
+
+    meta_line = f"Data de processamento: {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
+    if metadata.get('duration'):
+        duration_str = _format_duration(metadata['duration'])
+        meta_line += f" | Duração: {duration_str}"
+    if metadata.get('uploader'):
+        meta_line += f" | Canal/Criador: {metadata['uploader']}"
+    if metadata.get('webpage_url'):
+        meta_line += f" | URL: {metadata['webpage_url']}"
+    # Escaped once, as a whole: URLs routinely contain '&'.
+    story.append(Paragraph(_pdf_markup(meta_line), body_style))
+    story.append(Spacer(1, 16))
+
+    if summary:
+        story.append(Paragraph("Sumário", heading_style))
+        story.append(Spacer(1, 8))
+        story.extend(Paragraph(text, body_style) for text in _pdf_paragraphs(summary))
+        story.append(Spacer(1, 16))
+
+    story.append(Paragraph("Transcrição Completa", heading_style))
+    story.append(Spacer(1, 8))
+    story.extend(Paragraph(text, body_style) for text in _pdf_paragraphs(transcription or ""))
+
+    doc.build(story)
+    return str(output_path_obj)
+
+
+def _pdf_markup(text: str) -> str:
+    """Sanitize and XML-escape ``text`` for a reportlab Paragraph, keeping line breaks."""
+    from xml.sax.saxutils import escape
+
+    return escape(sanitize_xml_string(text)).replace('\n', '<br/>')
+
+
+def _pdf_paragraphs(text: str) -> List[str]:
+    """Split ``text`` on blank lines and return the non-empty paragraphs as safe markup."""
+    safe_text = sanitize_xml_string(text)
+    return [
+        _pdf_markup(chunk.strip())
+        for chunk in safe_text.split('\n\n')
+        if chunk.strip()
+    ]
+
+
 def export(
     transcription: str,
     summary: Optional[str] = None,
@@ -264,7 +353,7 @@ def export(
         summary: Texto sumarizado (opcional)
         metadata: Metadados do vídeo/áudio
         output_path: Caminho base do arquivo (sem extensão)
-        format_type: Tipo de formato (txt, md, json, docx)
+        format_type: Tipo de formato (txt, md, json, docx, pdf)
     
     Returns:
         Caminho do arquivo criado
@@ -284,5 +373,7 @@ def export(
         return export_json(transcription, summary, metadata, output_path)
     elif format_type == 'docx':
         return export_docx(transcription, summary, metadata, output_path)
+    elif format_type == 'pdf':
+        return export_pdf(transcription, summary, metadata, output_path)
     else:
-        raise ValueError(f"Formato não suportado: {format_type}. Use: txt, md, json, docx")
+        raise ValueError(f"Formato não suportado: {format_type}. Use: txt, md, json, docx, pdf")
diff --git a/lazier/core/playlist.py b/lazier/core/playlist.py
@@ -5,7 +5,7 @@ Processamento de playlists do YouTube
 import re
 from typing import List, Dict, Any, Optional
 import yt_dlp
-from .downloader import download_youtube_audio
+from ..downloader import download_youtube_audio
 
 
 def is_playlist_url(url: str) -> bool:
diff --git a/lazier/core/supported_sites.py b/lazier/core/supported_sites.py
@@ -0,0 +1,199 @@
+"""
+Lista curada de sites que o Lazier pode processar (vídeo/áudio via yt-dlp).
+Usada para exibição no README e na interface web.
+Lista completa mantida pelo yt-dlp: https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md
+"""
+
+# Display-only list: each site appears once (duplicates and case variants
+# removed). Sites whose domains are listed in MUSIC_ONLY_DOMAINS
+# (lazier/downloader.py) are left out, because the downloader refuses them.
+SUPPORTED_VIDEO_SITES = [
+    "YouTube",
+    "TED",
+    "Reddit",
+    "Vimeo",
+    "Twitter / X",
+    "TikTok",
+    "Instagram",
+    "Facebook",
+    "Twitch",
+    "Dailymotion",
+    "BBC",
+    "BBC iPlayer",
+    "CNN",
+    "NBC",
+    "NPR",
+    "PBS",
+    "Arte",
+    "France TV",
+    "RTVE",
+    "RAI",
+    "ZDF",
+    "Khan Academy",
+    "Coursera",
+    "Udemy",
+    "LinkedIn Learning",
+    "Loom",
+    "Streamable",
+    "BitChute",
+    "Odysee",
+    "Rumble",
+    "PeerTube",
+    "archive.org",
+    "Mixcloud",
+    "Apple Podcasts",
+    "Patreon",
+    "Substack",
+    "Wistia",
+    "Imgur",
+    "Flickr",
+    "TwitCasting",
+    "Niconico",
+    "Bilibili",
+    "Douyin",
+    "Weibo",
+    "iQiyi",
+    "Youku",
+    "Naver",
+    "V LIVE",
+    "Kick",
+    "Floatplane",
+    "Nebula",
+    "CuriosityStream",
+    "Dropout",
+    "Rooster Teeth",
+    "GameSpot",
+    "IGN",
+    "Giant Bomb",
+    "GDC Vault",
+    "Microsoft Build",
+    "PyVideo",
+    "media.ccc.de",
+    "C-SPAN",
+    "Senate.gov",
+    "Parliament Live",
+    "Europarl",
+    "Bundestag",
+    "Al Jazeera",
+    "DW",
+    "Reuters",
+    "Bloomberg",
+    "CGTN",
+    "CNA",
+    "NHK",
+    "ABC (AU)",
+    "SBS",
+    "CBC",
+    "NBC Sports",
+    "ESPN",
+    "Fox Sports",
+    "MLB",
+    "NFL",
+    "FIFA",
+    "Formula 1",
+    "PGA Tour",
+    "Wimbledon",
+    "Olympics",
+    "Red Bull TV",
+    "GQ",
+    "Vogue",
+    "WIRED",
+    "The New Yorker",
+    "Conde Nast",
+    "NYTimes",
+    "Washington Post",
+    "The Guardian",
+    "HuffPost",
+    "BuzzFeed",
+    "Vox",
+    "Vice",
+    "TMZ",
+    "Hollywood Reporter",
+    "SlideShare",
+    "Speaker Deck",
+    "Prezi",
+    "Google Drive",
+    "Dropbox",
+    "OneDrive",
+    "VK",
+    "OK.ru",
+    "Rutube",
+    "Coub",
+    "9GAG",
+    "Gfycat",
+    "Streamja",
+    "Clippit",
+    "Clyp",
+    "Vocaroo",
+    "Acast",
+    "Libsyn",
+    "Simplecast",
+    "Anchor",
+    "iHeartRadio",
+    "TuneIn",
+    "Stitcher",
+    "Podbean",
+    "Megaphone",
+    "3sat",
+    "ARD",
+    "NDR",
+    "WDR",
+    "MDR",
+    "BR",
+    "SRF",
+    "RTS",
+    "RSI",
+    "RTR",
+    "ORF",
+    "TVP",
+    "CT",
+    "RTVS",
+    "RTV",
+    "HRT",
+    "RTL",
+    "SBS (KR)",
+    "KBS",
+    "MBN",
+    "JTBC",
+    "TV Asahi",
+    "Fuji TV",
+    "TVer",
+    "Abema",
+    "GYAO!",
+    "OpenRec",
+    "Mirrativ",
+    "Showroom",
+    "Bigo",
+    "17.live",
+    "AfreecaTV",
+    "Naver TV",
+    "KakaoTV",
+    "Weverse",
+    "Kuaishou",
+    "AcFun",
+    "Tencent Video",
+    "Mango TV",
+    "Sohu",
+    "LeTV",
+    "PPTV",
+    "QQ Music",
+    "NetEase",
+    "Ximalaya",
+    "YouTube (alternativas)",
+    "Invidious",
+    "Piped",
+]
diff --git a/lazier/downloader.py b/lazier/downloader.py
@@ -1,5 +1,5 @@
 """
-Módulo para download de vídeos do YouTube usando yt-dlp
+Módulo para download de vídeos do YouTube e outros sites usando yt-dlp
 """
 
 import os
@@ -9,16 +9,32 @@ import logging
 import re
 from pathlib import Path
 from typing import Optional, Dict, Any, Tuple, Type
+from urllib.parse import urlparse
 import yt_dlp
 
 from .core.exceptions import (
     YouTubeDownloadError,
     YouTubeVideoUnavailableError,
-    YouTubeAccessDeniedError
+    YouTubeAccessDeniedError,
+    MusicContentError,
 )
 
 logger = logging.getLogger(__name__)
 
+# Music-only domains: URLs on these hosts are never processed.
+MUSIC_ONLY_DOMAINS = frozenset((
+    'audius.co',
+    'bandcamp.com',
+    'deezer.com',
+    'gaana.com',
+    'hungama.com',
+    'itunes.apple.com',
+    'jiosaavn.com',
+    'music.apple.com',
+    'music.youtube.com',
+    'napster.com',
+    'open.spotify.com',
+    'pandora.com',
+    'soundcloud.com',
+    'spotify.com',
+    'tidal.com',
+    'www.deezer.com',
+    'wynk.in',
+    'yandexmusic.ru',
+))
+
+# category/genre/tag values that mark content as music
+MUSIC_CATEGORY_KEYWORDS = frozenset((
+    'album',
+    'canção',
+    'cancao',
+    'clip',
+    'music',
+    'music video',
+    'music video clip',
+    'musica',
+    'mv',
+    'música',
+    'single',
+    'song',
+))
+
 
 def _check_deno_available() -> bool:
     """Verifica se Deno está disponível no sistema"""
@@ -42,6 +58,49 @@ def _extract_error_code(error_str: str) -> Optional[str]:
     return None
 
 
+def is_music_domain(url: str) -> bool:
+    """Return True when ``url`` points at a music-only domain.
+
+    The host is taken from the parsed URL's ``hostname``, so an explicit port
+    (``open.spotify.com:443``) or userinfo (``user@spotify.com``) cannot bypass
+    the check. A leading ``www.`` is ignored and subdomains of a listed domain
+    match as well. Scheme-less input such as ``open.spotify.com/track/1`` is
+    also recognised.
+
+    Args:
+        url: URL (or bare host/path) to inspect.
+
+    Returns:
+        True if the host is in ``MUSIC_ONLY_DOMAINS`` or is a subdomain of one
+        of its entries; False otherwise, including for empty or unparseable input.
+    """
+    if not url or not isinstance(url, str):
+        return False
+    candidate = url.strip()
+    if '://' not in candidate:
+        # Without a scheme urlparse puts the host in the path; '//' makes it
+        # treat the leading component as the network location.
+        candidate = '//' + candidate
+    try:
+        host = urlparse(candidate).hostname  # lower-cased, no port/userinfo
+    except ValueError:
+        return False
+    if not host:
+        return False
+    host = host.rstrip('.')  # tolerate a fully-qualified trailing dot
+    if host.startswith('www.'):
+        host = host[4:]
+    return host in MUSIC_ONLY_DOMAINS or any(
+        host.endswith('.' + domain) for domain in MUSIC_ONLY_DOMAINS
+    )
+
+
+def is_music_content(info: Dict[str, Any]) -> bool:
+    """
+    Tell whether yt-dlp ``extract_info`` metadata describes music content.
+
+    The ``categories``/``category``, ``genre`` and the first 20 ``tags`` values
+    are compared against ``MUSIC_CATEGORY_KEYWORDS``. A keyword only matches as
+    a whole word or phrase inside a value, so "Music & Audio" matches ``music``
+    while "eclipse" does not match ``clip``.
+
+    Args:
+        info: Metadata dict returned by yt-dlp (may be empty or None).
+
+    Returns:
+        True if any inspected value contains a music keyword.
+    """
+    if not info:
+        return False
+
+    values = []
+    categories = info.get('categories')
+    if isinstance(categories, (list, tuple)):
+        values.extend(categories)
+    elif isinstance(categories, str):
+        values.append(categories)
+    if info.get('category'):
+        values.append(info['category'])
+
+    genre = info.get('genre')
+    if isinstance(genre, str):
+        values.append(genre)
+
+    tags = info.get('tags') or []
+    if isinstance(tags, str):
+        tags = [tags]
+    values.extend(list(tags)[:20])
+
+    # Longest keywords first so multi-word phrases win over their prefixes;
+    # the look-arounds forbid matches in the middle of a longer word.
+    alternatives = '|'.join(
+        re.escape(keyword)
+        for keyword in sorted(MUSIC_CATEGORY_KEYWORDS, key=len, reverse=True)
+    )
+    if not alternatives:
+        return False
+    keyword_pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
+
+    return any(
+        keyword_pattern.search(str(value).lower())
+        for value in values
+        if value
+    )
+
+
 def _classify_youtube_error(error: Exception) -> Tuple[Type[YouTubeDownloadError], bool]:
     """
     Classifica erro do YouTube e retorna (exception_class, should_retry)
@@ -136,9 +195,43 @@ def _create_ydl_opts(output_path: Path, format_str: str = 'bestaudio/best', use_
     return opts
 
 
+def _create_ydl_opts_generic(
+    output_path: Path,
+    format_str: str = 'bestaudio/best',
+    progress_hook=None,
+) -> Dict[str, Any]:
+    """
+    Build generic yt-dlp options (no YouTube-specific extractor_args).
+
+    Intended for any site yt-dlp supports (TED, Reddit, Vimeo, etc.).
+
+    Args:
+        output_path: Directory where the file is written; the file name
+            follows the ``%(title)s.%(ext)s`` template.
+        format_str: yt-dlp format selector.
+        progress_hook: Optional callable registered under ``progress_hooks``.
+
+    Returns:
+        Options dict for ``yt_dlp.YoutubeDL``.
+    """
+    # NOTE(review): 'nocheckcertificate' turns off TLS certificate
+    # verification for every URL handled here — confirm this is intentional.
+    # NOTE(review): 'extractaudio'/'audioformat' look like CLI-style option
+    # names; the mp3 conversion presumably comes from the postprocessor below —
+    # verify against the yt-dlp API.
+    filename_template = str(output_path / '%(title)s.%(ext)s')
+    mp3_postprocessor = {
+        'key': 'FFmpegExtractAudio',
+        'preferredcodec': 'mp3',
+        'preferredquality': '192',
+    }
+    ydl_options = dict([
+        ('format', format_str),
+        ('outtmpl', filename_template),
+        ('user_agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'),
+        ('nocheckcertificate', True),
+        ('ignoreerrors', False),
+        ('logtostderr', False),
+        ('quiet', False),
+        ('no_warnings', False),
+        ('retries', 5),
+        ('fragment_retries', 5),
+        ('extractaudio', True),
+        ('audioformat', 'mp3'),
+        ('postprocessors', [mp3_postprocessor]),
+    ])
+    if not progress_hook:
+        return ydl_options
+    ydl_options['progress_hooks'] = [progress_hook]
+    return ydl_options
+
+
 def download_youtube_audio(url: str, output_dir: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
     """
-    Baixa o melhor áudio disponível de um vídeo do YouTube com retry logic e fallbacks
+    Baixa o melhor áudio disponível de um vídeo do YouTube com retry logic e fallbacks.
+    Em hipótese alguma processa conteúdo detectado como música.
     
     Args:
         url: URL do vídeo do YouTube
@@ -148,8 +241,13 @@ def download_youtube_audio(url: str, output_dir: Optional[str] = None) -> tuple[
         Tupla (caminho_do_arquivo, metadados)
     
     Raises:
+        MusicContentError: Se o conteúdo for detectado como música
         Exception: Se o download falhar após todas as tentativas
     """
+    if is_music_domain(url):
+        raise MusicContentError(
+            "Conteúdo detectado como música não é processado pelo Lazier."
+        )
     if output_dir is None:
         output_dir = tempfile.gettempdir()
     
@@ -232,6 +330,12 @@ def download_youtube_audio(url: str, output_dir: Optional[str] = None) -> tuple[
                         )
                     continue
                 
+                # Em hipótese alguma processar conteúdo detectado como música
+                if is_music_content(info):
+                    raise MusicContentError(
+                        "Conteúdo detectado como música não é processado pelo Lazier."
+                    )
+                
                 # Salva metadados importantes
                 metadata = {
                     'title': info.get('title', 'Sem título'),
@@ -288,6 +392,8 @@ def download_youtube_audio(url: str, output_dir: Optional[str] = None) -> tuple[
                     logger.info(f"Download bem-sucedido na tentativa {attempt}: {downloaded_file}")
                     break
                     
+        except MusicContentError:
+            raise
         except (YouTubeVideoUnavailableError, YouTubeAccessDeniedError, YouTubeDownloadError) as e:
             # Re-levanta exceções customizadas
             raise
@@ -338,3 +444,88 @@ def download_youtube_audio(url: str, output_dir: Optional[str] = None) -> tuple[
         )
     
     return downloaded_file, metadata
+
+
+def download_video_audio(url: str, output_dir: Optional[str] = None) -> tuple[str, Dict[str, Any]]:
+    """
+    Download the best available audio from any site supported by yt-dlp
+    (TED, Reddit, Vimeo, etc.). Content detected as music is never processed.
+
+    Args:
+        url: URL of the video.
+        output_dir: Output directory (optional; the system temp dir is used
+            when None).
+
+    Returns:
+        Tuple (path_of_the_audio_file, metadata).
+
+    Raises:
+        MusicContentError: If the URL or the extracted info is detected as music.
+        YouTubeDownloadError: If the info cannot be extracted or the resulting
+            file cannot be located after the download.
+        Exception: Any error raised by yt-dlp during the download itself.
+    """
+    if is_music_domain(url):
+        raise MusicContentError(
+            "Conteúdo detectado como música não é processado pelo Lazier."
+        )
+    if output_dir is None:
+        output_dir = tempfile.gettempdir()
+
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    downloaded_file = None
+
+    def progress_hook(d):
+        # Remember the file name yt-dlp reports once the transfer finishes.
+        nonlocal downloaded_file
+        if d.get('status') == 'finished':
+            downloaded_file = d.get('filename')
+
+    ydl_opts = _create_ydl_opts_generic(
+        output_path,
+        format_str='bestaudio/best',
+        progress_hook=progress_hook,
+    )
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        try:
+            info = ydl.extract_info(url, download=False)
+        except (yt_dlp.utils.DownloadError, yt_dlp.utils.ExtractorError) as e:
+            raise YouTubeDownloadError(
+                f"URL não é um vídeo suportado ou falha ao extrair: {e}",
+                original_error=e,
+            ) from e
+
+        if not info:
+            raise YouTubeDownloadError("Não foi possível obter informações do vídeo.")
+
+        # Check the extracted metadata as well as the domain.
+        if is_music_content(info):
+            raise MusicContentError(
+                "Conteúdo detectado como música não é processado pelo Lazier."
+            )
+
+        metadata = {
+            'title': info.get('title', 'Sem título'),
+            'duration': info.get('duration'),
+            'uploader': info.get('uploader', 'Desconhecido'),
+            'upload_date': info.get('upload_date'),
+            'description': info.get('description', ''),
+            'webpage_url': info.get('webpage_url', url),
+        }
+
+        ydl.download([url])
+
+        # The progress hook and prepare_filename() both report the name of the
+        # file as downloaded, but the FFmpegExtractAudio post-processor leaves
+        # an '.mp3' next to it. Probe both names so the shared temp dir is not
+        # scanned for "the newest file", which may belong to something else.
+        candidates = []
+        for raw_path in (downloaded_file, ydl.prepare_filename(info)):
+            if raw_path:
+                candidates.append(raw_path)
+                candidates.append(os.path.splitext(raw_path)[0] + '.mp3')
+        downloaded_file = next(
+            (path for path in candidates if os.path.exists(path)),
+            None,
+        )
+
+        if downloaded_file is None:
+            # Last resort: newest MP3 in the output directory. Restricted to
+            # '*.mp3' because that is the only format this function produces.
+            mp3_files = [f for f in output_path.glob('*.mp3') if f.is_file()]
+            if mp3_files:
+                downloaded_file = str(max(mp3_files, key=os.path.getmtime))
+
+        if not downloaded_file or not os.path.exists(downloaded_file):
+            raise YouTubeDownloadError(
+                "Arquivo baixado não encontrado após download.",
+                original_error=None,
+            )
+
+    return downloaded_file, metadata
diff --git a/lazier/web/templates/index.html b/lazier/web/templates/index.html
@@ -411,6 +411,54 @@
             box-shadow: 0 0 0 3px rgba(255, 179, 0, 0.1);
         }
 
+        /* ===== SITES SUPORTADOS ===== */
+        .supported-sites-block {
+            margin-bottom: var(--spacing-lg);
+            padding: var(--spacing-md);
+            background: linear-gradient(135deg, rgba(40, 53, 147, 0.04) 0%, rgba(255, 179, 0, 0.06) 100%);
+            border-radius: var(--border-radius-md);
+            border: 1px solid var(--color-border-light);
+        }
+        .supported-sites-title {
+            font-size: clamp(0.85rem, 2vw, 0.95rem);
+            color: var(--color-primary);
+            margin-bottom: var(--spacing-sm);
+            font-weight: 600;
+        }
+        .supported-sites-list {
+            max-height: 200px;
+            overflow-y: auto;
+            overflow-x: hidden;
+            padding: var(--spacing-sm) 0;
+            display: flex;
+            flex-wrap: wrap;
+            gap: var(--spacing-xs);
+            align-content: flex-start;
+        }
+        .supported-sites-list .site-tag {
+            display: inline-block;
+            padding: var(--spacing-xs) var(--spacing-sm);
+            background: rgba(255, 255, 255, 0.9);
+            border: 1px solid var(--color-border);
+            border-radius: var(--border-radius-sm);
+            font-size: 0.75rem;
+            color: var(--color-text);
+        }
+        .supported-sites-link {
+            display: inline-block;
+            margin-top: var(--spacing-sm);
+            font-size: 0.8rem;
+            color: var(--color-primary);
+            text-decoration: none;
+        }
+        .supported-sites-link:hover {
+            text-decoration: underline;
+        }
+        .supported-sites-loading {
+            color: var(--color-text-light);
+            font-size: 0.85rem;
+        }
+
         /* ===== PROCESSING OPTIONS ===== */
         .processing-options {
             margin: var(--spacing-lg) 0;
@@ -1141,6 +1189,15 @@
                 
                 <input type="text" id="urlInput" class="url-input" placeholder="Ou cole uma URL do YouTube ou página web aqui...">
                 
+                <!-- Sites suportados (vídeo/áudio) -->
+                <div class="supported-sites-block" style="flex-shrink: 0;">
+                    <h3 class="supported-sites-title">Cole a URL de vídeo/áudio de qualquer um destes sites (e muitos outros):</h3>
+                    <div class="supported-sites-list" id="supportedSitesList" aria-label="Lista de sites suportados">
+                        <span class="supported-sites-loading" id="supportedSitesLoading">Carregando...</span>
+                    </div>
+                    <a href="https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md" target="_blank" rel="noopener noreferrer" class="supported-sites-link">Ver lista completa no yt-dlp ↗</a>
+                </div>
+                
                 <!-- Opções de Processamento -->
                 <div class="processing-options" style="flex-shrink: 0;">
                     <h3>Modo de Processamento</h3>
@@ -1177,6 +1234,7 @@
                             <option value="txt">TXT</option>
                             <option value="md">Markdown</option>
                             <option value="json">JSON</option>
+                            <option value="pdf">PDF</option>
                         </select>
                     </div>
                     <div class="option-group">
@@ -1239,10 +1297,26 @@
         let allJobs = [];
         let currentFilter = 'all';
         
+        // Load the list of supported sites and render each one as a tag.
+        // On any failure (network, HTTP error, bad payload) the loading
+        // placeholder is replaced by an error message.
+        async function loadSupportedSites() {
+            const listEl = document.getElementById('supportedSitesList');
+            const loadingEl = document.getElementById('supportedSitesLoading');
+            try {
+                const response = await fetch('/api/supported-sites', { credentials: 'include' });
+                // fetch() only rejects on network failure; route HTTP errors
+                // to the catch block instead of rendering an empty list.
+                if (!response.ok) {
+                    throw new Error(`HTTP ${response.status}`);
+                }
+                const data = await response.json();
+                const sites = (data && Array.isArray(data.sites)) ? data.sites : [];
+                // Build the markup before touching the DOM so that a failure
+                // here leaves the placeholder visible for the error text.
+                const tagsHtml = sites.map(s => `<span class="site-tag">${escapeHtml(s)}</span>`).join('');
+                // Replacing the children also removes the loading placeholder.
+                listEl.innerHTML = tagsHtml;
+            } catch (e) {
+                loadingEl.textContent = 'Não foi possível carregar a lista.';
+            }
+        }
+        
         // Inicialização
         document.addEventListener('DOMContentLoaded', () => {
             document.getElementById('currentYear').textContent = new Date().getFullYear();
             loadHistory();
+            loadSupportedSites();
             
             // Animações de entrada
             const elements = document.querySelectorAll('.page-content > *');
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
     "aiofiles>=23.2.0",
     "redis>=5.0.0",
     "hiredis>=2.2.0",
+    "reportlab>=4.0.0",
 ]
 
 [project.scripts]
diff --git a/requirements.txt b/requirements.txt
@@ -20,3 +20,4 @@ playwright>=1.40.0
 chardet>=5.0.0
 passlib[bcrypt]>=1.7.4
 itsdangerous>=2.1.2
+reportlab>=4.0.0

A	.dockerignore	\|	61	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	README.md	\|	26	+++++++++++++++++++++++++-
M	lazier/api/routes.py	\|	132	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M	lazier/cli.py	\|	51	+++++++++++++++++++++++++++++++++++++++++++++++----
M	lazier/core/exceptions.py	\|	5	+++++
M	lazier/core/formats.py	\|	97	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
M	lazier/core/playlist.py	\|	2	+-
A	lazier/core/supported_sites.py	\|	199	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	lazier/downloader.py	\|	197	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
M	lazier/web/templates/index.html	\|	74	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	pyproject.toml	\|	1	+
M	requirements.txt	\|	1	+