lazier

personal summarizer
Log | Files | Refs | README

summarizer.py (34816B)


      1 """
      2 Sumario e renderizacao em portugues usando os modelos de chat da OpenAI.
      3 
      4 Inclui dois caminhos:
      5 
      6 - `summarize_text` (legado) -> texto livre, mantido por compatibilidade.
      7 - `summarize_smart` -> Structured Outputs com schema `SmartSummary` (TL;DR,
      8   key points, decisoes, action items, topicos, citacoes, perguntas em aberto).
      9   E o caminho preferido quando `OPENAI_ENABLE_SMART_SUMMARY=true`.
     10 
     11 A `render_text_in_portuguese` continua igual, agora usando o modelo do
     12 `get_model_config()` quando o caller nao especifica um.
     13 """
     14 
     15 from __future__ import annotations
     16 
     17 import json
     18 import os
     19 from concurrent.futures import ThreadPoolExecutor, as_completed
     20 from typing import Any, Dict, List, Optional, Tuple
     21 
     22 try:
     23     from openai import OpenAI
     24 except ImportError:  # pragma: no cover - ambiente sem openai
     25     OpenAI = None
     26 try:
     27     from dotenv import load_dotenv
     28 except ImportError:  # pragma: no cover - ambiente sem python-dotenv
     29     def load_dotenv():
     30         return False
     31 try:
     32     from pydantic import BaseModel, Field
     33 except ImportError:  # pragma: no cover - pydantic indisponivel
     34     BaseModel = None  # type: ignore[misc,assignment]
     35 
     36     def Field(*args, **kwargs):  # type: ignore[misc]
     37         return None
     38 
     39 from .web.extractor import extract_pdf_content, extract_text_file_content, extract_web_content
     40 from .core.config import VALID_REASONING_EFFORTS, get_model_config
     41 
     42 load_dotenv()
     43 
     44 
     45 # ---------------------------------------------------------------------------
     46 # Schema do sumario inteligente
     47 # ---------------------------------------------------------------------------
     48 
# The pydantic models are only declared when pydantic could be imported;
# otherwise both names fall back to plain `dict` so the module still imports.
if BaseModel is not None:

    # One concrete task extracted from the text. Every field defaults to an
    # empty string, so a partially filled item still validates.
    class ActionItem(BaseModel):
        owner: str = Field(default="", description="Responsavel ou pessoa indicada")
        task: str = Field(default="", description="Acao a ser executada")
        due_hint: str = Field(default="", description="Prazo, data ou referencia temporal")


    # Structured summary payload. The field names mirror the keys listed in
    # SMART_SUMMARY_JSON_SCHEMA; every list defaults to empty and `tldr`
    # defaults to an empty string.
    class SmartSummary(BaseModel):
        tldr: str = Field(default="", description="Sumario de 1-3 frases")
        key_points: List[str] = Field(default_factory=list)
        decisions: List[str] = Field(default_factory=list)
        action_items: List[ActionItem] = Field(default_factory=list)
        topics: List[str] = Field(default_factory=list)
        quotes: List[str] = Field(default_factory=list)
        open_questions: List[str] = Field(default_factory=list)

else:  # pragma: no cover - safety net for environments without pydantic
    ActionItem = dict  # type: ignore[misc,assignment]
    SmartSummary = dict  # type: ignore[misc,assignment]
     69 
     70 
     71 # Schema explicito para `response_format=json_schema` (mais seguro que confiar
     72 # em `model_json_schema()` para garantir strict-mode).
     73 SMART_SUMMARY_JSON_SCHEMA: Dict[str, Any] = {
     74     "name": "SmartSummary",
     75     "strict": True,
     76     "schema": {
     77         "type": "object",
     78         "additionalProperties": False,
     79         "required": [
     80             "tldr",
     81             "key_points",
     82             "decisions",
     83             "action_items",
     84             "topics",
     85             "quotes",
     86             "open_questions",
     87         ],
     88         "properties": {
     89             "tldr": {"type": "string"},
     90             "key_points": {"type": "array", "items": {"type": "string"}},
     91             "decisions": {"type": "array", "items": {"type": "string"}},
     92             "action_items": {
     93                 "type": "array",
     94                 "items": {
     95                     "type": "object",
     96                     "additionalProperties": False,
     97                     "required": ["owner", "task", "due_hint"],
     98                     "properties": {
     99                         "owner": {"type": "string"},
    100                         "task": {"type": "string"},
    101                         "due_hint": {"type": "string"},
    102                     },
    103                 },
    104             },
    105             "topics": {"type": "array", "items": {"type": "string"}},
    106             "quotes": {"type": "array", "items": {"type": "string"}},
    107             "open_questions": {"type": "array", "items": {"type": "string"}},
    108         },
    109     },
    110 }
    111 
    112 
# Complementary persona/instruction text per content type. The values are
# prompt fragments sent to the model, so they are runtime data and stay in
# Portuguese. Keys are lower-case content-type identifiers.
PERSONA_HINTS: Dict[str, str] = {
    # Lectures and classes.
    "lecture": (
        "O texto e uma palestra ou aula. Enfatize conceitos-chave, definicoes, "
        "exemplos e a sequencia logica de raciocinio. Em action_items, registre "
        "exercicios ou leituras sugeridas."
    ),
    # Podcast episodes.
    "podcast": (
        "O texto e um episodio de podcast. Destaque tese central, opinioes dos "
        "convidados, exemplos e historias contadas. Em quotes, prefira trechos "
        "marcantes; em action_items, recomendacoes praticas dadas ao ouvinte."
    ),
    # Interviews.
    "interview": (
        "O texto e uma entrevista. Estruture key_points por blocos tematicos das "
        "perguntas, e em quotes registre as respostas mais reveladoras."
    ),
    # News articles.
    "news": (
        "O texto e jornalistico. Foque em fatos verificaveis, numeros, fontes "
        "citadas e linha do tempo. Em open_questions, deixe lacunas relatadas "
        "pela propria materia."
    ),
    # Tutorials and how-tos.
    "tutorial": (
        "O texto e um tutorial ou how-to. Em key_points, liste passos na ordem; "
        "em action_items, transforme cada passo em uma acao clara com pre-req."
    ),
    # Meetings.
    "meeting": (
        "O texto e uma reuniao. Seja muito explicito em decisions e action_items "
        "(owner + task + due_hint). Em open_questions, registre pendencias."
    ),
    # Technical documentation.
    "tech_doc": (
        "O texto e um documento tecnico. Preserve nomes de APIs, parametros, "
        "constantes e pre-requisitos em key_points."
    ),
    # No extra instructions.
    "other": "",
}
    148 
    149 
# Character limit per call (a very generous stand-in for a token budget).
# Lowers the risk of hitting the input-token limit on models with a smaller
# context window.
DEFAULT_CHUNK_CHAR_LIMIT = 300_000
    153 
    154 
    155 # ---------------------------------------------------------------------------
    156 # Infra OpenAI
    157 # ---------------------------------------------------------------------------
    158 
    159 def _ensure_client() -> "OpenAI":
    160     api_key = os.getenv("OPENAI_API_KEY")
    161     if not api_key:
    162         raise Exception(
    163             "OPENAI_API_KEY nao encontrada. "
    164             "Configure a variavel de ambiente OPENAI_API_KEY ou crie um arquivo .env"
    165         )
    166     if OpenAI is None:
    167         raise Exception("openai nao esta instalado neste ambiente.")
    168     return OpenAI(api_key=api_key)
    169 
    170 
    171 def _wrap_chat_error(exc: Exception, action: str) -> Exception:
    172     error_msg = str(exc)
    173     lowered = error_msg.lower()
    174     if "api_key" in lowered or "authentication" in lowered:
    175         return Exception("Erro de autenticacao com OpenAI API. Verifique sua OPENAI_API_KEY.")
    176     return Exception(f"Erro ao {action}: {error_msg}")
    177 
    178 
    179 def _chat_completions_kwargs(
    180     *,
    181     model: str,
    182     messages: List[Dict[str, Any]],
    183     response_format: Optional[Dict[str, Any]] = None,
    184     temperature: Optional[float] = None,
    185     reasoning_effort: Optional[str] = None,
    186 ) -> Dict[str, Any]:
    187     """Monta kwargs respeitando peculiaridades por familia de modelo."""
    188 
    189     config = get_model_config()
    190     kwargs: Dict[str, Any] = {"model": model, "messages": messages}
    191     if response_format is not None:
    192         kwargs["response_format"] = response_format
    193     effort = (reasoning_effort if reasoning_effort is not None else config.reasoning_effort).lower()
    194     if effort not in VALID_REASONING_EFFORTS:
    195         effort = config.reasoning_effort
    196     if config.supports_reasoning(model):
    197         kwargs["reasoning_effort"] = effort
    198     elif temperature is not None:
    199         # Modelos que nao sao da familia gpt-5/o aceitam temperature normalmente.
    200         kwargs["temperature"] = temperature
    201     return kwargs
    202 
    203 
    204 # ---------------------------------------------------------------------------
    205 # Conversao para PT-BR (legacy mantido)
    206 # ---------------------------------------------------------------------------
    207 
    208 def render_text_in_portuguese(
    209     text: str,
    210     model: Optional[str] = None,
    211     reasoning_effort: Optional[str] = None,
    212 ) -> str:
    213     """Converte qualquer texto para portugues do Brasil preservando detalhes."""
    214 
    215     if not text or not text.strip():
    216         return ""
    217 
    218     config = get_model_config()
    219     chosen_model = model or config.chat_model
    220 
    221     max_chars = DEFAULT_CHUNK_CHAR_LIMIT
    222     if len(text) <= max_chars:
    223         return _render_portuguese_chunk(text, chosen_model, reasoning_effort=reasoning_effort)
    224 
    225     chunks = _split_text_into_chunks(text, max_chars)
    226     rendered_chunks: List[str] = []
    227     print(f"Texto longo detectado ({len(text)} caracteres). Convertendo {len(chunks)} partes para portugues...")
    228     for i, chunk in enumerate(chunks):
    229         print(f"Convertendo parte {i + 1}/{len(chunks)}...")
    230         rendered_chunks.append(_render_portuguese_chunk(chunk, chosen_model, reasoning_effort=reasoning_effort))
    231     return "\n\n".join(chunk.strip() for chunk in rendered_chunks if chunk.strip())
    232 
    233 
    234 def _render_portuguese_chunk(text: str, model: str, reasoning_effort: Optional[str] = None) -> str:
    235     client = _ensure_client()
    236     prompt = (
    237         "Converta o texto a seguir para portugues do Brasil.\n\n"
    238         "Regras:\n"
    239         "- Se o texto ja estiver em portugues, mantenha em portugues do Brasil natural.\n"
    240         "- Nao resuma, nao explique, nao comente o texto.\n"
    241         "- Preserve nomes proprios, numeros, datas, links, listas e estrutura.\n"
    242         "- Mantenha o maximo de fidelidade possivel ao conteudo original.\n"
    243         "- Use acentuacao e ortografia corretas em portugues do Brasil.\n\n"
    244         "Texto:\n\n"
    245     )
    246 
    247     try:
    248         kwargs = _chat_completions_kwargs(
    249             model=model,
    250             messages=[
    251                 {
    252                     "role": "system",
    253                     "content": "Voce e um tradutor tecnico e editorial que preserva fielmente o conteudo.",
    254                 },
    255                 {"role": "user", "content": prompt + text},
    256             ],
    257             temperature=0.1,
    258             reasoning_effort=reasoning_effort,
    259         )
    260         response = client.chat.completions.create(**kwargs)
    261         return (response.choices[0].message.content or "").strip()
    262     except Exception as exc:
    263         raise _wrap_chat_error(exc, "converter texto para portugues") from exc
    264 
    265 
    266 POLISH_PT_BR_CHUNK_CHARS = 24_000
    267 
    268 
    269 def polish_pt_br_text(
    270     text: str,
    271     kind: str = "transcription",
    272     model: Optional[str] = None,
    273     reasoning_effort: Optional[str] = None,
    274 ) -> str:
    275     """
    276     Revisao ortografica leve em pt-BR: acentos, grafia, paragrafos.
    277     Nao altera factos, nomes proprios, numeros, links nem marcadores **Falante N**.
    278     """
    279 
    280     if not text or not text.strip():
    281         return text or ""
    282 
    283     config = get_model_config()
    284     chosen_model = model or config.chat_model
    285     max_chars = min(POLISH_PT_BR_CHUNK_CHARS, DEFAULT_CHUNK_CHAR_LIMIT)
    286 
    287     kind_norm = (kind or "transcription").strip().lower()
    288     if kind_norm == "summary":
    289         kind_hint = (
    290             "Este trecho e um sumario ou sintese: mantenha tom conciso e bullet/numeracao se ja existir."
    291         )
    292     else:
    293         kind_hint = (
    294             "Este trecho e transcricao ou texto corrido: preserve a ordem e falas; "
    295             "nao resuma nem omita frases."
    296         )
    297 
    298     if len(text) <= max_chars:
    299         return _polish_pt_br_chunk(text, chosen_model, kind_hint, reasoning_effort=reasoning_effort)
    300 
    301     chunks = _split_text_into_chunks(text, max_chars)
    302     out: List[str] = []
    303     for i, chunk in enumerate(chunks):
    304         print(f"Revisao ortografica pt-BR: parte {i + 1}/{len(chunks)}...")
    305         out.append(_polish_pt_br_chunk(chunk, chosen_model, kind_hint, reasoning_effort=reasoning_effort))
    306     return "\n\n".join(c.strip() for c in out if c.strip())
    307 
    308 
    309 def _polish_pt_br_chunk(
    310     text: str,
    311     model: str,
    312     kind_hint: str,
    313     reasoning_effort: Optional[str] = None,
    314 ) -> str:
    315     client = _ensure_client()
    316     prompt = (
    317         "Revise o texto em portugues do Brasil.\n\n"
    318         f"{kind_hint}\n\n"
    319         "Regras obrigatorias:\n"
    320         "- Corrija acentuacao e ortografia em pt-BR.\n"
    321         "- Melhore quebras de linha e paragrafos quando ajudar a leitura.\n"
    322         "- Nao altere factos, dados, numeros, datas, nomes proprios, codigos, URLs.\n"
    323         "- Preserve linhas que comecam com **Falante N** e a estrutura logo abaixo delas.\n"
    324         "- Nao adicione comentarios meta; devolva apenas o texto revisado.\n\n"
    325         "Texto:\n\n"
    326     )
    327     try:
    328         kwargs = _chat_completions_kwargs(
    329             model=model,
    330             messages=[
    331                 {
    332                     "role": "system",
    333                     "content": "Voce e um revisor editorial em portugues do Brasil, preciso e conservador.",
    334                 },
    335                 {"role": "user", "content": prompt + text},
    336             ],
    337             temperature=0.05,
    338             reasoning_effort=reasoning_effort,
    339         )
    340         response = client.chat.completions.create(**kwargs)
    341         return (response.choices[0].message.content or "").strip()
    342     except Exception as exc:
    343         raise _wrap_chat_error(exc, "revisar ortografia em portugues") from exc
    344 
    345 
    346 # ---------------------------------------------------------------------------
    347 # Sumario textual legado
    348 # ---------------------------------------------------------------------------
    349 
    350 def summarize_text(
    351     text: str,
    352     model: Optional[str] = None,
    353     language: str = "pt-BR",
    354     reasoning_effort: Optional[str] = None,
    355 ) -> str:
    356     """Sumariza texto em formato livre (caminho legado)."""
    357 
    358     if not text or not text.strip():
    359         return "Texto vazio - nao e possivel gerar sumario."
    360 
    361     config = get_model_config()
    362     chosen_model = model or config.chat_model
    363 
    364     use_hier = config.hierarchical_summary and len(text) > config.summary_direct_max_chars
    365     if use_hier:
    366         from .core.long_summary import chunk_text_with_overlap
    367 
    368         chunks = chunk_text_with_overlap(
    369             text.strip(),
    370             config.summary_map_chunk_chars,
    371             config.summary_chunk_overlap_chars,
    372         )
    373         if not chunks:
    374             return "Texto vazio - nao e possivel gerar sumario."
    375         if len(chunks) == 1:
    376             return _summarize_chunk(chunks[0], chosen_model, language, reasoning_effort=reasoning_effort)
    377         chunk_summaries: List[str] = []
    378         print(
    379             f"Sumario legado hierarquico: {len(text)} caracteres -> {len(chunks)} chunks "
    380             f"(overlap {config.summary_chunk_overlap_chars})."
    381         )
    382         for i, chunk in enumerate(chunks):
    383             try:
    384                 print(f"Sumarizando parte {i + 1}/{len(chunks)}...")
    385                 chunk_summaries.append(
    386                     _summarize_chunk(chunk, chosen_model, language, reasoning_effort=reasoning_effort)
    387                 )
    388             except Exception as exc:
    389                 print(f"Erro ao sumarizar parte {i + 1}: {exc}")
    390                 chunk_summaries.append(f"[Erro nesta parte: {exc}]")
    391 
    392         valid_summaries = [s for s in chunk_summaries if not s.startswith("[Erro")]
    393         if not valid_summaries:
    394             return "Falha ao gerar sumario: todas as partes falharam."
    395         combined = "\n\n".join(valid_summaries)
    396         return _summarize_chunk(
    397             combined,
    398             chosen_model,
    399             language,
    400             is_final=True,
    401             reasoning_effort=reasoning_effort,
    402         )
    403 
    404     max_chars = 400_000
    405 
    406     if len(text) <= max_chars:
    407         return _summarize_chunk(text, chosen_model, language, reasoning_effort=reasoning_effort)
    408 
    409     chunks = _split_text_into_chunks(text, max_chars)
    410     chunk_summaries: List[str] = []
    411     print(f"Texto longo detectado ({len(text)} caracteres). Dividido em {len(chunks)} partes.")
    412     for i, chunk in enumerate(chunks):
    413         try:
    414             print(f"Sumarizando parte {i + 1}/{len(chunks)}...")
    415             chunk_summaries.append(_summarize_chunk(chunk, chosen_model, language, reasoning_effort=reasoning_effort))
    416         except Exception as exc:
    417             print(f"Erro ao sumarizar parte {i + 1}: {exc}")
    418             chunk_summaries.append(f"[Erro nesta parte: {exc}]")
    419 
    420     valid_summaries = [s for s in chunk_summaries if not s.startswith("[Erro")]
    421     if not valid_summaries:
    422         return "Falha ao gerar sumario: todas as partes falharam."
    423     if len(valid_summaries) > 1:
    424         print("Consolidando sumarios parciais...")
    425         combined = "\n\n".join(valid_summaries)
    426         return _summarize_chunk(
    427             combined,
    428             chosen_model,
    429             language,
    430             is_final=True,
    431             reasoning_effort=reasoning_effort,
    432         )
    433     return valid_summaries[0]
    434 
    435 
    436 def _summarize_chunk(
    437     text: str,
    438     model: str,
    439     language: str,
    440     is_final: bool = False,
    441     reasoning_effort: Optional[str] = None,
    442 ) -> str:
    443     client = _ensure_client()
    444 
    445     if language.startswith("pt"):
    446         prompt = (
    447             "Voce e um assistente especializado em criar sumarios detalhados e completos.\n\n"
    448             "Crie um sumario COMPLETO e DETALHADO do seguinte texto em portugues do Brasil. O sumario DEVE:\n"
    449             "- Manter TODOS os pontos importantes, chave e informacoes relevantes\n"
    450             "- Preservar numeros, datas, nomes, estatisticas e dados tecnicos\n"
    451             "- Manter a estrutura logica e sequencia do conteudo original\n"
    452             "- Ser detalhado o suficiente para nao perder informacoes essenciais\n"
    453             "- Destacar os principais temas e subtemas\n"
    454             "- Ser escrito em portugues do Brasil\n\n"
    455             "Texto para sumarizar:\n\n"
    456         )
    457         if is_final:
    458             prompt = (
    459                 "Voce recebeu multiplos sumarios parciais de um texto longo. Crie um sumario final "
    460                 "unificado e COMPLETO em portugues do Brasil que integre TODOS os pontos principais, "
    461                 "preserve numeros/datas/nomes e seja coerente.\n\nSumarios parciais:\n\n"
    462             )
    463     else:
    464         prompt = (
    465             f"You are an assistant specialized in creating concise and informative summaries.\n\n"
    466             f"Please create a detailed summary of the following text in {language}.\n\nText:\n\n"
    467         )
    468 
    469     try:
    470         kwargs = _chat_completions_kwargs(
    471             model=model,
    472             messages=[
    473                 {
    474                     "role": "system",
    475                     "content": "You are a helpful assistant that creates detailed and comprehensive summaries.",
    476                 },
    477                 {"role": "user", "content": prompt + text},
    478             ],
    479             temperature=0.2,
    480             reasoning_effort=reasoning_effort,
    481         )
    482         response = client.chat.completions.create(**kwargs)
    483         return (response.choices[0].message.content or "").strip()
    484     except Exception as exc:
    485         raise _wrap_chat_error(exc, "sumarizar texto") from exc
    486 
    487 
    488 # ---------------------------------------------------------------------------
    489 # Sumario inteligente (Structured Outputs)
    490 # ---------------------------------------------------------------------------
    491 
    492 def summarize_smart(
    493     text: str,
    494     *,
    495     model: Optional[str] = None,
    496     content_type: Optional[str] = None,
    497     language: str = "pt-BR",
    498     reasoning_effort: Optional[str] = None,
    499 ) -> Dict[str, Any]:
    500     """Gera sumario estruturado e devolve dicionario serializavel.
    501 
    502     Sempre devolve um dict no formato `SmartSummary` (mesmo em fallback). Se a
    503     chamada com structured outputs falhar, faz uma tentativa adicional com
    504     `json_object`. Se ainda assim falhar, embrulha o texto livre em `tldr`.
    505     """
    506 
    507     if BaseModel is None:
    508         raise Exception("pydantic nao esta instalado neste ambiente.")
    509 
    510     if not text or not text.strip():
    511         return SmartSummary(tldr="Texto vazio - nao e possivel gerar sumario.").model_dump()
    512 
    513     config = get_model_config()
    514     chosen_model = model or config.chat_model
    515 
    516     use_hier = config.hierarchical_summary and len(text) > config.summary_direct_max_chars
    517     if use_hier:
    518         from .core.long_summary import chunk_text_with_overlap
    519 
    520         chunks = chunk_text_with_overlap(
    521             text.strip(),
    522             config.summary_map_chunk_chars,
    523             config.summary_chunk_overlap_chars,
    524         )
    525         if not chunks:
    526             return SmartSummary(tldr="Texto vazio - nao e possivel gerar sumario.").model_dump()
    527         print(
    528             f"Sumario inteligente hierarquico: {len(text)} chars -> {len(chunks)} chunks "
    529             f"(overlap {config.summary_chunk_overlap_chars})."
    530         )
    531         if len(chunks) == 1:
    532             return _summarize_smart_chunk(
    533                 chunks[0],
    534                 chosen_model,
    535                 content_type,
    536                 language,
    537                 reasoning_effort=reasoning_effort,
    538             ).model_dump()
    539         parciais_h = _collect_smart_partials(
    540             chunks,
    541             chosen_model,
    542             content_type,
    543             language,
    544             reasoning_effort,
    545             label="Sumario inteligente hierarquico",
    546         )
    547         if not parciais_h:
    548             return SmartSummary(tldr="Falha ao gerar sumario: todas as partes falharam.").model_dump()
    549         if len(parciais_h) == 1:
    550             return parciais_h[0].model_dump()
    551         return _merge_smart_summaries(
    552             parciais_h,
    553             chosen_model,
    554             content_type,
    555             reasoning_effort=reasoning_effort,
    556         ).model_dump()
    557 
    558     chunks = _split_text_into_chunks(text, DEFAULT_CHUNK_CHAR_LIMIT) if len(text) > DEFAULT_CHUNK_CHAR_LIMIT else [text]
    559 
    560     if len(chunks) == 1:
    561         return _summarize_smart_chunk(
    562             chunks[0],
    563             chosen_model,
    564             content_type,
    565             language,
    566             reasoning_effort=reasoning_effort,
    567         ).model_dump()
    568 
    569     print(f"Texto longo detectado ({len(text)} chars) - rodando sumario inteligente em {len(chunks)} partes.")
    570     parciais = _collect_smart_partials(
    571         chunks,
    572         chosen_model,
    573         content_type,
    574         language,
    575         reasoning_effort,
    576         label="Sumario inteligente",
    577     )
    578 
    579     if not parciais:
    580         return SmartSummary(tldr="Falha ao gerar sumario: todas as partes falharam.").model_dump()
    581 
    582     if len(parciais) == 1:
    583         return parciais[0].model_dump()
    584 
    585     return _merge_smart_summaries(
    586         parciais,
    587         chosen_model,
    588         content_type,
    589         reasoning_effort=reasoning_effort,
    590     ).model_dump()
    591 
    592 
    593 def _build_smart_messages(
    594     text: str,
    595     content_type: Optional[str],
    596     language: str,
    597     is_merge: bool = False,
    598 ) -> List[Dict[str, Any]]:
    599     persona_extra = PERSONA_HINTS.get((content_type or "other").lower(), "")
    600 
    601     system = (
    602         "Voce e um analista que cria sumarios estruturados de altissima qualidade em "
    603         f"{'portugues do Brasil' if language.startswith('pt') else language}. "
    604         "Mantenha fidelidade ao texto, sem inventar dados. Quando um campo nao tiver "
    605         "informacao, devolva uma lista vazia ou string vazia."
    606     )
    607     if persona_extra:
    608         system += " " + persona_extra
    609 
    610     if is_merge:
    611         user = (
    612             "Voce recebera varios sumarios estruturados parciais de um mesmo texto longo. "
    613             "Consolide-os em UM unico SmartSummary final, eliminando duplicatas, mantendo "
    614             "todos os pontos relevantes e preservando numeros, datas e nomes.\n\n"
    615             "Sumarios parciais (JSON):\n\n" + text
    616         )
    617     else:
    618         user = (
    619             "Crie um SmartSummary do texto abaixo. Regras:\n"
    620             "- tldr: 1-3 frases.\n"
    621             "- key_points: pontos principais em ordem logica.\n"
    622             "- decisions: decisoes explicitas tomadas no texto (vazio se nao houver).\n"
    623             "- action_items: tarefas concretas com owner/task/due_hint quando possivel.\n"
    624             "- topics: temas curtos (2-4 palavras cada).\n"
    625             "- quotes: trechos literais marcantes.\n"
    626             "- open_questions: perguntas em aberto ou pontos nao resolvidos.\n\n"
    627             "Texto:\n\n" + text
    628         )
    629 
    630     return [
    631         {"role": "system", "content": system},
    632         {"role": "user", "content": user},
    633     ]
    634 
    635 
    636 def _summarize_smart_chunk(
    637     text: str,
    638     model: str,
    639     content_type: Optional[str],
    640     language: str,
    641     *,
    642     reasoning_effort: Optional[str] = None,
    643 ) -> SmartSummary:
    644     client = _ensure_client()
    645     messages = _build_smart_messages(text, content_type, language, is_merge=False)
    646 
    647     try:
    648         kwargs = _chat_completions_kwargs(
    649             model=model,
    650             messages=messages,
    651             response_format={"type": "json_schema", "json_schema": SMART_SUMMARY_JSON_SCHEMA},
    652             temperature=0.2,
    653             reasoning_effort=reasoning_effort,
    654         )
    655         response = client.chat.completions.create(**kwargs)
    656         raw = response.choices[0].message.content or "{}"
    657         return _parse_smart_summary(raw)
    658     except Exception as exc:
    659         # Fallback: pede JSON livre e tenta parsear.
    660         print(f"Aviso: structured outputs falhou ({exc}); tentando json_object...")
    661         return _summarize_smart_fallback_json(client, model, messages, reasoning_effort=reasoning_effort)
    662 
    663 
    664 def _collect_smart_partials(
    665     chunks: List[str],
    666     chosen_model: str,
    667     content_type: Optional[str],
    668     language: str,
    669     reasoning_effort: Optional[str],
    670     *,
    671     label: str = "Sumario inteligente",
    672 ) -> List[SmartSummary]:
    673     """Map de chunks para SmartSummary; paralelo quando LAZIER_SUMMARY_PARALLEL_WORKERS > 1."""
    674     config = get_model_config()
    675     total = len(chunks)
    676     workers = min(config.summary_parallel_workers, total) if total > 1 else 1
    677 
    678     def _one(index: int, chunk: str) -> Tuple[int, Optional[SmartSummary]]:
    679         print(f"{label} parte {index + 1}/{total}...")
    680         try:
    681             return index, _summarize_smart_chunk(
    682                 chunk,
    683                 chosen_model,
    684                 content_type,
    685                 language,
    686                 reasoning_effort=reasoning_effort,
    687             )
    688         except Exception as exc:
    689             print(f"Erro ao sumarizar parte {index + 1}: {exc}")
    690             return index, None
    691 
    692     if workers <= 1:
    693         ordered: List[SmartSummary] = []
    694         for index, chunk in enumerate(chunks):
    695             _, partial = _one(index, chunk)
    696             if partial:
    697                 ordered.append(partial)
    698         return ordered
    699 
    700     slots: List[Optional[SmartSummary]] = [None] * total
    701     with ThreadPoolExecutor(max_workers=workers) as pool:
    702         futures = [pool.submit(_one, index, chunk) for index, chunk in enumerate(chunks)]
    703         for future in as_completed(futures):
    704             idx, partial = future.result()
    705             if partial:
    706                 slots[idx] = partial
    707     return [item for item in slots if item is not None]
    708 
    709 
    710 def _summarize_smart_fallback_json(
    711     client: "OpenAI",
    712     model: str,
    713     messages: List[Dict[str, Any]],
    714     *,
    715     reasoning_effort: Optional[str] = None,
    716 ) -> SmartSummary:
    717     fallback_messages = list(messages)
    718     fallback_messages[0] = {
    719         **messages[0],
    720         "content": messages[0]["content"]
    721         + " Devolva apenas um JSON valido com as chaves: tldr, key_points, decisions, "
    722           "action_items (lista de objetos com owner/task/due_hint), topics, quotes, open_questions.",
    723     }
    724 
    725     try:
    726         kwargs = _chat_completions_kwargs(
    727             model=model,
    728             messages=fallback_messages,
    729             response_format={"type": "json_object"},
    730             temperature=0.2,
    731             reasoning_effort=reasoning_effort,
    732         )
    733         response = client.chat.completions.create(**kwargs)
    734         raw = response.choices[0].message.content or "{}"
    735         return _parse_smart_summary(raw)
    736     except Exception as exc:
    737         print(f"Aviso: fallback JSON tambem falhou ({exc}); embrulhando texto livre.")
    738         # Ultimo recurso: roda o sumario textual e enfia tudo em tldr.
    739         try:
    740             text_only_kwargs = _chat_completions_kwargs(
    741                 model=model,
    742                 messages=fallback_messages,
    743                 response_format=None,
    744                 temperature=0.2,
    745                 reasoning_effort=reasoning_effort,
    746             )
    747             response = client.chat.completions.create(**text_only_kwargs)
    748             free_text = (response.choices[0].message.content or "").strip()
    749         except Exception:
    750             free_text = ""
    751         return SmartSummary(tldr=free_text or "Falha ao gerar sumario estruturado.")
    752 
    753 
    754 def _merge_smart_summaries(
    755     parciais: List[SmartSummary],
    756     model: str,
    757     content_type: Optional[str],
    758     *,
    759     reasoning_effort: Optional[str] = None,
    760 ) -> SmartSummary:
    761     client = _ensure_client()
    762     serialized = json.dumps([p.model_dump() for p in parciais], ensure_ascii=False, indent=2)
    763     messages = _build_smart_messages(serialized, content_type, "pt-BR", is_merge=True)
    764 
    765     try:
    766         kwargs = _chat_completions_kwargs(
    767             model=model,
    768             messages=messages,
    769             response_format={"type": "json_schema", "json_schema": SMART_SUMMARY_JSON_SCHEMA},
    770             temperature=0.2,
    771             reasoning_effort=reasoning_effort,
    772         )
    773         response = client.chat.completions.create(**kwargs)
    774         raw = response.choices[0].message.content or "{}"
    775         return _parse_smart_summary(raw)
    776     except Exception as exc:
    777         print(f"Aviso: merge estruturado falhou ({exc}); aplicando merge local.")
    778         return _local_merge_summaries(parciais)
    779 
    780 
    781 def _local_merge_summaries(parciais: List[SmartSummary]) -> SmartSummary:
    782     """Merge determinista local quando a API falha no merge final."""
    783     seen: Dict[str, set] = {
    784         "key_points": set(),
    785         "decisions": set(),
    786         "topics": set(),
    787         "quotes": set(),
    788         "open_questions": set(),
    789     }
    790     merged = SmartSummary()
    791     tldr_parts: List[str] = []
    792     actions_seen: set = set()
    793 
    794     for part in parciais:
    795         if part.tldr:
    796             tldr_parts.append(part.tldr)
    797         for field, target_set in seen.items():
    798             for value in getattr(part, field, []) or []:
    799                 if value and value not in target_set:
    800                     target_set.add(value)
    801                     getattr(merged, field).append(value)
    802         for action in part.action_items or []:
    803             key = (action.owner or "", action.task or "", action.due_hint or "")
    804             if key not in actions_seen and (action.task or action.owner):
    805                 actions_seen.add(key)
    806                 merged.action_items.append(action)
    807 
    808     merged.tldr = " ".join(tldr_parts).strip()[:600]
    809     return merged
    810 
    811 
    812 def _parse_smart_summary(raw: str) -> SmartSummary:
    813     data = json.loads(raw)
    814     return SmartSummary(**data)
    815 
    816 
    817 # ---------------------------------------------------------------------------
    818 # Render para texto Markdown (compat com exports atuais)
    819 # ---------------------------------------------------------------------------
    820 
    821 def format_smart_summary_as_text(summary: Dict[str, Any]) -> str:
    822     """Converte um dict SmartSummary em Markdown legivel."""
    823 
    824     if not summary:
    825         return ""
    826 
    827     lines: List[str] = []
    828     tldr = summary.get("tldr") or ""
    829     if tldr.strip():
    830         lines.append("**TL;DR**")
    831         lines.append("")
    832         lines.append(tldr.strip())
    833         lines.append("")
    834 
    835     def _bullet_block(title: str, items: List[Any]) -> None:
    836         if not items:
    837             return
    838         lines.append(f"**{title}**")
    839         lines.append("")
    840         for item in items:
    841             if isinstance(item, dict):
    842                 owner = item.get("owner") or "(sem responsavel)"
    843                 task = item.get("task") or ""
    844                 due = item.get("due_hint") or ""
    845                 rendered = f"- {owner}: {task}".rstrip()
    846                 if due:
    847                     rendered += f" (prazo: {due})"
    848                 lines.append(rendered)
    849             else:
    850                 value = str(item).strip()
    851                 if value:
    852                     lines.append(f"- {value}")
    853         lines.append("")
    854 
    855     _bullet_block("Pontos-chave", summary.get("key_points") or [])
    856     _bullet_block("Decisoes", summary.get("decisions") or [])
    857     _bullet_block("Acoes", summary.get("action_items") or [])
    858     _bullet_block("Topicos", summary.get("topics") or [])
    859     _bullet_block("Citacoes", summary.get("quotes") or [])
    860     _bullet_block("Perguntas em aberto", summary.get("open_questions") or [])
    861 
    862     return "\n".join(lines).strip()
    863 
    864 
    865 # ---------------------------------------------------------------------------
    866 # Helpers de alto nivel para fontes de texto
    867 # ---------------------------------------------------------------------------
    868 
    869 def summarize_text_file(file_path: str, model: Optional[str] = None, language: str = "pt-BR") -> str:
    870     content_data = extract_text_file_content(file_path)
    871     text = content_data.get("content", "")
    872     if not text or not text.strip():
    873         return "Arquivo vazio - nao e possivel gerar sumario."
    874     return summarize_text(text, model=model, language=language)
    875 
    876 
    877 def summarize_web_page(url: str, model: Optional[str] = None, language: str = "pt-BR") -> str:
    878     content_data = extract_web_content(url)
    879     text = content_data.get("content", "")
    880     if not text or not text.strip():
    881         return "Nao foi possivel extrair conteudo da pagina web."
    882     return summarize_text(text, model=model, language=language)
    883 
    884 
    885 def summarize_pdf(file_path: str, model: Optional[str] = None, language: str = "pt-BR") -> str:
    886     content_data = extract_pdf_content(file_path)
    887     text = content_data.get("content", "")
    888     if not text or not text.strip():
    889         return "PDF vazio ou nao foi possivel extrair texto."
    890     return summarize_text(text, model=model, language=language)
    891 
    892 
    893 # ---------------------------------------------------------------------------
    894 # Chunking utilitario
    895 # ---------------------------------------------------------------------------
    896 
    897 def _split_text_into_chunks(text: str, max_chars: int) -> List[str]:
    898     """Divide texto em chunks respeitando paragrafos, sentencas e o limite."""
    899 
    900     chunks: List[str] = []
    901     current_chunk = ""
    902 
    903     paragraphs = text.split("\n\n")
    904 
    905     for paragraph in paragraphs:
    906         if len(current_chunk) + len(paragraph) + 2 <= max_chars:
    907             current_chunk += paragraph + "\n\n"
    908             continue
    909 
    910         if current_chunk:
    911             chunks.append(current_chunk.strip())
    912             current_chunk = ""
    913 
    914         if len(paragraph) > max_chars:
    915             sentences = paragraph.split(". ")
    916             temp_chunk = ""
    917             for sentence in sentences:
    918                 if len(temp_chunk) + len(sentence) + 2 <= max_chars:
    919                     temp_chunk += sentence + ". "
    920                 else:
    921                     if temp_chunk:
    922                         chunks.append(temp_chunk.strip())
    923                     temp_chunk = sentence + ". "
    924             current_chunk = temp_chunk
    925         else:
    926             current_chunk = paragraph + "\n\n"
    927 
    928     if current_chunk:
    929         chunks.append(current_chunk.strip())
    930 
    931     return chunks