summarizer.py (34816B)
1 """ 2 Sumario e renderizacao em portugues usando os modelos de chat da OpenAI. 3 4 Inclui dois caminhos: 5 6 - `summarize_text` (legado) -> texto livre, mantido por compatibilidade. 7 - `summarize_smart` -> Structured Outputs com schema `SmartSummary` (TL;DR, 8 key points, decisoes, action items, topicos, citacoes, perguntas em aberto). 9 E o caminho preferido quando `OPENAI_ENABLE_SMART_SUMMARY=true`. 10 11 A `render_text_in_portuguese` continua igual, agora usando o modelo do 12 `get_model_config()` quando o caller nao especifica um. 13 """ 14 15 from __future__ import annotations 16 17 import json 18 import os 19 from concurrent.futures import ThreadPoolExecutor, as_completed 20 from typing import Any, Dict, List, Optional, Tuple 21 22 try: 23 from openai import OpenAI 24 except ImportError: # pragma: no cover - ambiente sem openai 25 OpenAI = None 26 try: 27 from dotenv import load_dotenv 28 except ImportError: # pragma: no cover - ambiente sem python-dotenv 29 def load_dotenv(): 30 return False 31 try: 32 from pydantic import BaseModel, Field 33 except ImportError: # pragma: no cover - pydantic indisponivel 34 BaseModel = None # type: ignore[misc,assignment] 35 36 def Field(*args, **kwargs): # type: ignore[misc] 37 return None 38 39 from .web.extractor import extract_pdf_content, extract_text_file_content, extract_web_content 40 from .core.config import VALID_REASONING_EFFORTS, get_model_config 41 42 load_dotenv() 43 44 45 # --------------------------------------------------------------------------- 46 # Schema do sumario inteligente 47 # --------------------------------------------------------------------------- 48 49 if BaseModel is not None: 50 51 class ActionItem(BaseModel): 52 owner: str = Field(default="", description="Responsavel ou pessoa indicada") 53 task: str = Field(default="", description="Acao a ser executada") 54 due_hint: str = Field(default="", description="Prazo, data ou referencia temporal") 55 56 57 class SmartSummary(BaseModel): 58 tldr: str = Field(default="", description="Sumario de 1-3 frases") 59 key_points: List[str] = Field(default_factory=list) 60 decisions: List[str] = Field(default_factory=list) 61 action_items: List[ActionItem] = Field(default_factory=list) 62 topics: List[str] = Field(default_factory=list) 63 quotes: List[str] = Field(default_factory=list) 64 open_questions: List[str] = Field(default_factory=list) 65 66 else: # pragma: no cover - apenas seguranca em ambientes sem pydantic 67 ActionItem = dict # type: ignore[misc,assignment] 68 SmartSummary = dict # type: ignore[misc,assignment] 69 70 71 # Schema explicito para `response_format=json_schema` (mais seguro que confiar 72 # em `model_json_schema()` para garantir strict-mode). 73 SMART_SUMMARY_JSON_SCHEMA: Dict[str, Any] = { 74 "name": "SmartSummary", 75 "strict": True, 76 "schema": { 77 "type": "object", 78 "additionalProperties": False, 79 "required": [ 80 "tldr", 81 "key_points", 82 "decisions", 83 "action_items", 84 "topics", 85 "quotes", 86 "open_questions", 87 ], 88 "properties": { 89 "tldr": {"type": "string"}, 90 "key_points": {"type": "array", "items": {"type": "string"}}, 91 "decisions": {"type": "array", "items": {"type": "string"}}, 92 "action_items": { 93 "type": "array", 94 "items": { 95 "type": "object", 96 "additionalProperties": False, 97 "required": ["owner", "task", "due_hint"], 98 "properties": { 99 "owner": {"type": "string"}, 100 "task": {"type": "string"}, 101 "due_hint": {"type": "string"}, 102 }, 103 }, 104 }, 105 "topics": {"type": "array", "items": {"type": "string"}}, 106 "quotes": {"type": "array", "items": {"type": "string"}}, 107 "open_questions": {"type": "array", "items": {"type": "string"}}, 108 }, 109 }, 110 } 111 112 113 # Personas/instrucoes complementares por tipo de conteudo. 114 PERSONA_HINTS: Dict[str, str] = { 115 "lecture": ( 116 "O texto e uma palestra ou aula. Enfatize conceitos-chave, definicoes, " 117 "exemplos e a sequencia logica de raciocinio. Em action_items, registre " 118 "exercicios ou leituras sugeridas." 119 ), 120 "podcast": ( 121 "O texto e um episodio de podcast. Destaque tese central, opinioes dos " 122 "convidados, exemplos e historias contadas. Em quotes, prefira trechos " 123 "marcantes; em action_items, recomendacoes praticas dadas ao ouvinte." 124 ), 125 "interview": ( 126 "O texto e uma entrevista. Estruture key_points por blocos tematicos das " 127 "perguntas, e em quotes registre as respostas mais reveladoras." 128 ), 129 "news": ( 130 "O texto e jornalistico. Foque em fatos verificaveis, numeros, fontes " 131 "citadas e linha do tempo. Em open_questions, deixe lacunas relatadas " 132 "pela propria materia." 133 ), 134 "tutorial": ( 135 "O texto e um tutorial ou how-to. Em key_points, liste passos na ordem; " 136 "em action_items, transforme cada passo em uma acao clara com pre-req." 137 ), 138 "meeting": ( 139 "O texto e uma reuniao. Seja muito explicito em decisions e action_items " 140 "(owner + task + due_hint). Em open_questions, registre pendencias." 141 ), 142 "tech_doc": ( 143 "O texto e um documento tecnico. Preserve nomes de APIs, parametros, " 144 "constantes e pre-requisitos em key_points." 145 ), 146 "other": "", 147 } 148 149 150 # Limite de chars por chamada (~tokens muito generoso). Reduz risco de hit no 151 # input_tokens limit em modelos com janela menor. 152 DEFAULT_CHUNK_CHAR_LIMIT = 300_000 153 154 155 # --------------------------------------------------------------------------- 156 # Infra OpenAI 157 # --------------------------------------------------------------------------- 158 159 def _ensure_client() -> "OpenAI": 160 api_key = os.getenv("OPENAI_API_KEY") 161 if not api_key: 162 raise Exception( 163 "OPENAI_API_KEY nao encontrada. " 164 "Configure a variavel de ambiente OPENAI_API_KEY ou crie um arquivo .env" 165 ) 166 if OpenAI is None: 167 raise Exception("openai nao esta instalado neste ambiente.") 168 return OpenAI(api_key=api_key) 169 170 171 def _wrap_chat_error(exc: Exception, action: str) -> Exception: 172 error_msg = str(exc) 173 lowered = error_msg.lower() 174 if "api_key" in lowered or "authentication" in lowered: 175 return Exception("Erro de autenticacao com OpenAI API. Verifique sua OPENAI_API_KEY.") 176 return Exception(f"Erro ao {action}: {error_msg}") 177 178 179 def _chat_completions_kwargs( 180 *, 181 model: str, 182 messages: List[Dict[str, Any]], 183 response_format: Optional[Dict[str, Any]] = None, 184 temperature: Optional[float] = None, 185 reasoning_effort: Optional[str] = None, 186 ) -> Dict[str, Any]: 187 """Monta kwargs respeitando peculiaridades por familia de modelo.""" 188 189 config = get_model_config() 190 kwargs: Dict[str, Any] = {"model": model, "messages": messages} 191 if response_format is not None: 192 kwargs["response_format"] = response_format 193 effort = (reasoning_effort if reasoning_effort is not None else config.reasoning_effort).lower() 194 if effort not in VALID_REASONING_EFFORTS: 195 effort = config.reasoning_effort 196 if config.supports_reasoning(model): 197 kwargs["reasoning_effort"] = effort 198 elif temperature is not None: 199 # Modelos que nao sao da familia gpt-5/o aceitam temperature normalmente. 200 kwargs["temperature"] = temperature 201 return kwargs 202 203 204 # --------------------------------------------------------------------------- 205 # Conversao para PT-BR (legacy mantido) 206 # --------------------------------------------------------------------------- 207 208 def render_text_in_portuguese( 209 text: str, 210 model: Optional[str] = None, 211 reasoning_effort: Optional[str] = None, 212 ) -> str: 213 """Converte qualquer texto para portugues do Brasil preservando detalhes.""" 214 215 if not text or not text.strip(): 216 return "" 217 218 config = get_model_config() 219 chosen_model = model or config.chat_model 220 221 max_chars = DEFAULT_CHUNK_CHAR_LIMIT 222 if len(text) <= max_chars: 223 return _render_portuguese_chunk(text, chosen_model, reasoning_effort=reasoning_effort) 224 225 chunks = _split_text_into_chunks(text, max_chars) 226 rendered_chunks: List[str] = [] 227 print(f"Texto longo detectado ({len(text)} caracteres). Convertendo {len(chunks)} partes para portugues...") 228 for i, chunk in enumerate(chunks): 229 print(f"Convertendo parte {i + 1}/{len(chunks)}...") 230 rendered_chunks.append(_render_portuguese_chunk(chunk, chosen_model, reasoning_effort=reasoning_effort)) 231 return "\n\n".join(chunk.strip() for chunk in rendered_chunks if chunk.strip()) 232 233 234 def _render_portuguese_chunk(text: str, model: str, reasoning_effort: Optional[str] = None) -> str: 235 client = _ensure_client() 236 prompt = ( 237 "Converta o texto a seguir para portugues do Brasil.\n\n" 238 "Regras:\n" 239 "- Se o texto ja estiver em portugues, mantenha em portugues do Brasil natural.\n" 240 "- Nao resuma, nao explique, nao comente o texto.\n" 241 "- Preserve nomes proprios, numeros, datas, links, listas e estrutura.\n" 242 "- Mantenha o maximo de fidelidade possivel ao conteudo original.\n" 243 "- Use acentuacao e ortografia corretas em portugues do Brasil.\n\n" 244 "Texto:\n\n" 245 ) 246 247 try: 248 kwargs = _chat_completions_kwargs( 249 model=model, 250 messages=[ 251 { 252 "role": "system", 253 "content": "Voce e um tradutor tecnico e editorial que preserva fielmente o conteudo.", 254 }, 255 {"role": "user", "content": prompt + text}, 256 ], 257 temperature=0.1, 258 reasoning_effort=reasoning_effort, 259 ) 260 response = client.chat.completions.create(**kwargs) 261 return (response.choices[0].message.content or "").strip() 262 except Exception as exc: 263 raise _wrap_chat_error(exc, "converter texto para portugues") from exc 264 265 266 POLISH_PT_BR_CHUNK_CHARS = 24_000 267 268 269 def polish_pt_br_text( 270 text: str, 271 kind: str = "transcription", 272 model: Optional[str] = None, 273 reasoning_effort: Optional[str] = None, 274 ) -> str: 275 """ 276 Revisao ortografica leve em pt-BR: acentos, grafia, paragrafos. 277 Nao altera factos, nomes proprios, numeros, links nem marcadores **Falante N**. 278 """ 279 280 if not text or not text.strip(): 281 return text or "" 282 283 config = get_model_config() 284 chosen_model = model or config.chat_model 285 max_chars = min(POLISH_PT_BR_CHUNK_CHARS, DEFAULT_CHUNK_CHAR_LIMIT) 286 287 kind_norm = (kind or "transcription").strip().lower() 288 if kind_norm == "summary": 289 kind_hint = ( 290 "Este trecho e um sumario ou sintese: mantenha tom conciso e bullet/numeracao se ja existir." 291 ) 292 else: 293 kind_hint = ( 294 "Este trecho e transcricao ou texto corrido: preserve a ordem e falas; " 295 "nao resuma nem omita frases." 296 ) 297 298 if len(text) <= max_chars: 299 return _polish_pt_br_chunk(text, chosen_model, kind_hint, reasoning_effort=reasoning_effort) 300 301 chunks = _split_text_into_chunks(text, max_chars) 302 out: List[str] = [] 303 for i, chunk in enumerate(chunks): 304 print(f"Revisao ortografica pt-BR: parte {i + 1}/{len(chunks)}...") 305 out.append(_polish_pt_br_chunk(chunk, chosen_model, kind_hint, reasoning_effort=reasoning_effort)) 306 return "\n\n".join(c.strip() for c in out if c.strip()) 307 308 309 def _polish_pt_br_chunk( 310 text: str, 311 model: str, 312 kind_hint: str, 313 reasoning_effort: Optional[str] = None, 314 ) -> str: 315 client = _ensure_client() 316 prompt = ( 317 "Revise o texto em portugues do Brasil.\n\n" 318 f"{kind_hint}\n\n" 319 "Regras obrigatorias:\n" 320 "- Corrija acentuacao e ortografia em pt-BR.\n" 321 "- Melhore quebras de linha e paragrafos quando ajudar a leitura.\n" 322 "- Nao altere factos, dados, numeros, datas, nomes proprios, codigos, URLs.\n" 323 "- Preserve linhas que comecam com **Falante N** e a estrutura logo abaixo delas.\n" 324 "- Nao adicione comentarios meta; devolva apenas o texto revisado.\n\n" 325 "Texto:\n\n" 326 ) 327 try: 328 kwargs = _chat_completions_kwargs( 329 model=model, 330 messages=[ 331 { 332 "role": "system", 333 "content": "Voce e um revisor editorial em portugues do Brasil, preciso e conservador.", 334 }, 335 {"role": "user", "content": prompt + text}, 336 ], 337 temperature=0.05, 338 reasoning_effort=reasoning_effort, 339 ) 340 response = client.chat.completions.create(**kwargs) 341 return (response.choices[0].message.content or "").strip() 342 except Exception as exc: 343 raise _wrap_chat_error(exc, "revisar ortografia em portugues") from exc 344 345 346 # --------------------------------------------------------------------------- 347 # Sumario textual legado 348 # --------------------------------------------------------------------------- 349 350 def summarize_text( 351 text: str, 352 model: Optional[str] = None, 353 language: str = "pt-BR", 354 reasoning_effort: Optional[str] = None, 355 ) -> str: 356 """Sumariza texto em formato livre (caminho legado).""" 357 358 if not text or not text.strip(): 359 return "Texto vazio - nao e possivel gerar sumario." 360 361 config = get_model_config() 362 chosen_model = model or config.chat_model 363 364 use_hier = config.hierarchical_summary and len(text) > config.summary_direct_max_chars 365 if use_hier: 366 from .core.long_summary import chunk_text_with_overlap 367 368 chunks = chunk_text_with_overlap( 369 text.strip(), 370 config.summary_map_chunk_chars, 371 config.summary_chunk_overlap_chars, 372 ) 373 if not chunks: 374 return "Texto vazio - nao e possivel gerar sumario." 375 if len(chunks) == 1: 376 return _summarize_chunk(chunks[0], chosen_model, language, reasoning_effort=reasoning_effort) 377 chunk_summaries: List[str] = [] 378 print( 379 f"Sumario legado hierarquico: {len(text)} caracteres -> {len(chunks)} chunks " 380 f"(overlap {config.summary_chunk_overlap_chars})." 381 ) 382 for i, chunk in enumerate(chunks): 383 try: 384 print(f"Sumarizando parte {i + 1}/{len(chunks)}...") 385 chunk_summaries.append( 386 _summarize_chunk(chunk, chosen_model, language, reasoning_effort=reasoning_effort) 387 ) 388 except Exception as exc: 389 print(f"Erro ao sumarizar parte {i + 1}: {exc}") 390 chunk_summaries.append(f"[Erro nesta parte: {exc}]") 391 392 valid_summaries = [s for s in chunk_summaries if not s.startswith("[Erro")] 393 if not valid_summaries: 394 return "Falha ao gerar sumario: todas as partes falharam." 395 combined = "\n\n".join(valid_summaries) 396 return _summarize_chunk( 397 combined, 398 chosen_model, 399 language, 400 is_final=True, 401 reasoning_effort=reasoning_effort, 402 ) 403 404 max_chars = 400_000 405 406 if len(text) <= max_chars: 407 return _summarize_chunk(text, chosen_model, language, reasoning_effort=reasoning_effort) 408 409 chunks = _split_text_into_chunks(text, max_chars) 410 chunk_summaries: List[str] = [] 411 print(f"Texto longo detectado ({len(text)} caracteres). Dividido em {len(chunks)} partes.") 412 for i, chunk in enumerate(chunks): 413 try: 414 print(f"Sumarizando parte {i + 1}/{len(chunks)}...") 415 chunk_summaries.append(_summarize_chunk(chunk, chosen_model, language, reasoning_effort=reasoning_effort)) 416 except Exception as exc: 417 print(f"Erro ao sumarizar parte {i + 1}: {exc}") 418 chunk_summaries.append(f"[Erro nesta parte: {exc}]") 419 420 valid_summaries = [s for s in chunk_summaries if not s.startswith("[Erro")] 421 if not valid_summaries: 422 return "Falha ao gerar sumario: todas as partes falharam." 423 if len(valid_summaries) > 1: 424 print("Consolidando sumarios parciais...") 425 combined = "\n\n".join(valid_summaries) 426 return _summarize_chunk( 427 combined, 428 chosen_model, 429 language, 430 is_final=True, 431 reasoning_effort=reasoning_effort, 432 ) 433 return valid_summaries[0] 434 435 436 def _summarize_chunk( 437 text: str, 438 model: str, 439 language: str, 440 is_final: bool = False, 441 reasoning_effort: Optional[str] = None, 442 ) -> str: 443 client = _ensure_client() 444 445 if language.startswith("pt"): 446 prompt = ( 447 "Voce e um assistente especializado em criar sumarios detalhados e completos.\n\n" 448 "Crie um sumario COMPLETO e DETALHADO do seguinte texto em portugues do Brasil. O sumario DEVE:\n" 449 "- Manter TODOS os pontos importantes, chave e informacoes relevantes\n" 450 "- Preservar numeros, datas, nomes, estatisticas e dados tecnicos\n" 451 "- Manter a estrutura logica e sequencia do conteudo original\n" 452 "- Ser detalhado o suficiente para nao perder informacoes essenciais\n" 453 "- Destacar os principais temas e subtemas\n" 454 "- Ser escrito em portugues do Brasil\n\n" 455 "Texto para sumarizar:\n\n" 456 ) 457 if is_final: 458 prompt = ( 459 "Voce recebeu multiplos sumarios parciais de um texto longo. Crie um sumario final " 460 "unificado e COMPLETO em portugues do Brasil que integre TODOS os pontos principais, " 461 "preserve numeros/datas/nomes e seja coerente.\n\nSumarios parciais:\n\n" 462 ) 463 else: 464 prompt = ( 465 f"You are an assistant specialized in creating concise and informative summaries.\n\n" 466 f"Please create a detailed summary of the following text in {language}.\n\nText:\n\n" 467 ) 468 469 try: 470 kwargs = _chat_completions_kwargs( 471 model=model, 472 messages=[ 473 { 474 "role": "system", 475 "content": "You are a helpful assistant that creates detailed and comprehensive summaries.", 476 }, 477 {"role": "user", "content": prompt + text}, 478 ], 479 temperature=0.2, 480 reasoning_effort=reasoning_effort, 481 ) 482 response = client.chat.completions.create(**kwargs) 483 return (response.choices[0].message.content or "").strip() 484 except Exception as exc: 485 raise _wrap_chat_error(exc, "sumarizar texto") from exc 486 487 488 # --------------------------------------------------------------------------- 489 # Sumario inteligente (Structured Outputs) 490 # --------------------------------------------------------------------------- 491 492 def summarize_smart( 493 text: str, 494 *, 495 model: Optional[str] = None, 496 content_type: Optional[str] = None, 497 language: str = "pt-BR", 498 reasoning_effort: Optional[str] = None, 499 ) -> Dict[str, Any]: 500 """Gera sumario estruturado e devolve dicionario serializavel. 501 502 Sempre devolve um dict no formato `SmartSummary` (mesmo em fallback). Se a 503 chamada com structured outputs falhar, faz uma tentativa adicional com 504 `json_object`. Se ainda assim falhar, embrulha o texto livre em `tldr`. 505 """ 506 507 if BaseModel is None: 508 raise Exception("pydantic nao esta instalado neste ambiente.") 509 510 if not text or not text.strip(): 511 return SmartSummary(tldr="Texto vazio - nao e possivel gerar sumario.").model_dump() 512 513 config = get_model_config() 514 chosen_model = model or config.chat_model 515 516 use_hier = config.hierarchical_summary and len(text) > config.summary_direct_max_chars 517 if use_hier: 518 from .core.long_summary import chunk_text_with_overlap 519 520 chunks = chunk_text_with_overlap( 521 text.strip(), 522 config.summary_map_chunk_chars, 523 config.summary_chunk_overlap_chars, 524 ) 525 if not chunks: 526 return SmartSummary(tldr="Texto vazio - nao e possivel gerar sumario.").model_dump() 527 print( 528 f"Sumario inteligente hierarquico: {len(text)} chars -> {len(chunks)} chunks " 529 f"(overlap {config.summary_chunk_overlap_chars})." 530 ) 531 if len(chunks) == 1: 532 return _summarize_smart_chunk( 533 chunks[0], 534 chosen_model, 535 content_type, 536 language, 537 reasoning_effort=reasoning_effort, 538 ).model_dump() 539 parciais_h = _collect_smart_partials( 540 chunks, 541 chosen_model, 542 content_type, 543 language, 544 reasoning_effort, 545 label="Sumario inteligente hierarquico", 546 ) 547 if not parciais_h: 548 return SmartSummary(tldr="Falha ao gerar sumario: todas as partes falharam.").model_dump() 549 if len(parciais_h) == 1: 550 return parciais_h[0].model_dump() 551 return _merge_smart_summaries( 552 parciais_h, 553 chosen_model, 554 content_type, 555 reasoning_effort=reasoning_effort, 556 ).model_dump() 557 558 chunks = _split_text_into_chunks(text, DEFAULT_CHUNK_CHAR_LIMIT) if len(text) > DEFAULT_CHUNK_CHAR_LIMIT else [text] 559 560 if len(chunks) == 1: 561 return _summarize_smart_chunk( 562 chunks[0], 563 chosen_model, 564 content_type, 565 language, 566 reasoning_effort=reasoning_effort, 567 ).model_dump() 568 569 print(f"Texto longo detectado ({len(text)} chars) - rodando sumario inteligente em {len(chunks)} partes.") 570 parciais = _collect_smart_partials( 571 chunks, 572 chosen_model, 573 content_type, 574 language, 575 reasoning_effort, 576 label="Sumario inteligente", 577 ) 578 579 if not parciais: 580 return SmartSummary(tldr="Falha ao gerar sumario: todas as partes falharam.").model_dump() 581 582 if len(parciais) == 1: 583 return parciais[0].model_dump() 584 585 return _merge_smart_summaries( 586 parciais, 587 chosen_model, 588 content_type, 589 reasoning_effort=reasoning_effort, 590 ).model_dump() 591 592 593 def _build_smart_messages( 594 text: str, 595 content_type: Optional[str], 596 language: str, 597 is_merge: bool = False, 598 ) -> List[Dict[str, Any]]: 599 persona_extra = PERSONA_HINTS.get((content_type or "other").lower(), "") 600 601 system = ( 602 "Voce e um analista que cria sumarios estruturados de altissima qualidade em " 603 f"{'portugues do Brasil' if language.startswith('pt') else language}. " 604 "Mantenha fidelidade ao texto, sem inventar dados. Quando um campo nao tiver " 605 "informacao, devolva uma lista vazia ou string vazia." 606 ) 607 if persona_extra: 608 system += " " + persona_extra 609 610 if is_merge: 611 user = ( 612 "Voce recebera varios sumarios estruturados parciais de um mesmo texto longo. " 613 "Consolide-os em UM unico SmartSummary final, eliminando duplicatas, mantendo " 614 "todos os pontos relevantes e preservando numeros, datas e nomes.\n\n" 615 "Sumarios parciais (JSON):\n\n" + text 616 ) 617 else: 618 user = ( 619 "Crie um SmartSummary do texto abaixo. Regras:\n" 620 "- tldr: 1-3 frases.\n" 621 "- key_points: pontos principais em ordem logica.\n" 622 "- decisions: decisoes explicitas tomadas no texto (vazio se nao houver).\n" 623 "- action_items: tarefas concretas com owner/task/due_hint quando possivel.\n" 624 "- topics: temas curtos (2-4 palavras cada).\n" 625 "- quotes: trechos literais marcantes.\n" 626 "- open_questions: perguntas em aberto ou pontos nao resolvidos.\n\n" 627 "Texto:\n\n" + text 628 ) 629 630 return [ 631 {"role": "system", "content": system}, 632 {"role": "user", "content": user}, 633 ] 634 635 636 def _summarize_smart_chunk( 637 text: str, 638 model: str, 639 content_type: Optional[str], 640 language: str, 641 *, 642 reasoning_effort: Optional[str] = None, 643 ) -> SmartSummary: 644 client = _ensure_client() 645 messages = _build_smart_messages(text, content_type, language, is_merge=False) 646 647 try: 648 kwargs = _chat_completions_kwargs( 649 model=model, 650 messages=messages, 651 response_format={"type": "json_schema", "json_schema": SMART_SUMMARY_JSON_SCHEMA}, 652 temperature=0.2, 653 reasoning_effort=reasoning_effort, 654 ) 655 response = client.chat.completions.create(**kwargs) 656 raw = response.choices[0].message.content or "{}" 657 return _parse_smart_summary(raw) 658 except Exception as exc: 659 # Fallback: pede JSON livre e tenta parsear. 660 print(f"Aviso: structured outputs falhou ({exc}); tentando json_object...") 661 return _summarize_smart_fallback_json(client, model, messages, reasoning_effort=reasoning_effort) 662 663 664 def _collect_smart_partials( 665 chunks: List[str], 666 chosen_model: str, 667 content_type: Optional[str], 668 language: str, 669 reasoning_effort: Optional[str], 670 *, 671 label: str = "Sumario inteligente", 672 ) -> List[SmartSummary]: 673 """Map de chunks para SmartSummary; paralelo quando LAZIER_SUMMARY_PARALLEL_WORKERS > 1.""" 674 config = get_model_config() 675 total = len(chunks) 676 workers = min(config.summary_parallel_workers, total) if total > 1 else 1 677 678 def _one(index: int, chunk: str) -> Tuple[int, Optional[SmartSummary]]: 679 print(f"{label} parte {index + 1}/{total}...") 680 try: 681 return index, _summarize_smart_chunk( 682 chunk, 683 chosen_model, 684 content_type, 685 language, 686 reasoning_effort=reasoning_effort, 687 ) 688 except Exception as exc: 689 print(f"Erro ao sumarizar parte {index + 1}: {exc}") 690 return index, None 691 692 if workers <= 1: 693 ordered: List[SmartSummary] = [] 694 for index, chunk in enumerate(chunks): 695 _, partial = _one(index, chunk) 696 if partial: 697 ordered.append(partial) 698 return ordered 699 700 slots: List[Optional[SmartSummary]] = [None] * total 701 with ThreadPoolExecutor(max_workers=workers) as pool: 702 futures = [pool.submit(_one, index, chunk) for index, chunk in enumerate(chunks)] 703 for future in as_completed(futures): 704 idx, partial = future.result() 705 if partial: 706 slots[idx] = partial 707 return [item for item in slots if item is not None] 708 709 710 def _summarize_smart_fallback_json( 711 client: "OpenAI", 712 model: str, 713 messages: List[Dict[str, Any]], 714 *, 715 reasoning_effort: Optional[str] = None, 716 ) -> SmartSummary: 717 fallback_messages = list(messages) 718 fallback_messages[0] = { 719 **messages[0], 720 "content": messages[0]["content"] 721 + " Devolva apenas um JSON valido com as chaves: tldr, key_points, decisions, " 722 "action_items (lista de objetos com owner/task/due_hint), topics, quotes, open_questions.", 723 } 724 725 try: 726 kwargs = _chat_completions_kwargs( 727 model=model, 728 messages=fallback_messages, 729 response_format={"type": "json_object"}, 730 temperature=0.2, 731 reasoning_effort=reasoning_effort, 732 ) 733 response = client.chat.completions.create(**kwargs) 734 raw = response.choices[0].message.content or "{}" 735 return _parse_smart_summary(raw) 736 except Exception as exc: 737 print(f"Aviso: fallback JSON tambem falhou ({exc}); embrulhando texto livre.") 738 # Ultimo recurso: roda o sumario textual e enfia tudo em tldr. 739 try: 740 text_only_kwargs = _chat_completions_kwargs( 741 model=model, 742 messages=fallback_messages, 743 response_format=None, 744 temperature=0.2, 745 reasoning_effort=reasoning_effort, 746 ) 747 response = client.chat.completions.create(**text_only_kwargs) 748 free_text = (response.choices[0].message.content or "").strip() 749 except Exception: 750 free_text = "" 751 return SmartSummary(tldr=free_text or "Falha ao gerar sumario estruturado.") 752 753 754 def _merge_smart_summaries( 755 parciais: List[SmartSummary], 756 model: str, 757 content_type: Optional[str], 758 *, 759 reasoning_effort: Optional[str] = None, 760 ) -> SmartSummary: 761 client = _ensure_client() 762 serialized = json.dumps([p.model_dump() for p in parciais], ensure_ascii=False, indent=2) 763 messages = _build_smart_messages(serialized, content_type, "pt-BR", is_merge=True) 764 765 try: 766 kwargs = _chat_completions_kwargs( 767 model=model, 768 messages=messages, 769 response_format={"type": "json_schema", "json_schema": SMART_SUMMARY_JSON_SCHEMA}, 770 temperature=0.2, 771 reasoning_effort=reasoning_effort, 772 ) 773 response = client.chat.completions.create(**kwargs) 774 raw = response.choices[0].message.content or "{}" 775 return _parse_smart_summary(raw) 776 except Exception as exc: 777 print(f"Aviso: merge estruturado falhou ({exc}); aplicando merge local.") 778 return _local_merge_summaries(parciais) 779 780 781 def _local_merge_summaries(parciais: List[SmartSummary]) -> SmartSummary: 782 """Merge determinista local quando a API falha no merge final.""" 783 seen: Dict[str, set] = { 784 "key_points": set(), 785 "decisions": set(), 786 "topics": set(), 787 "quotes": set(), 788 "open_questions": set(), 789 } 790 merged = SmartSummary() 791 tldr_parts: List[str] = [] 792 actions_seen: set = set() 793 794 for part in parciais: 795 if part.tldr: 796 tldr_parts.append(part.tldr) 797 for field, target_set in seen.items(): 798 for value in getattr(part, field, []) or []: 799 if value and value not in target_set: 800 target_set.add(value) 801 getattr(merged, field).append(value) 802 for action in part.action_items or []: 803 key = (action.owner or "", action.task or "", action.due_hint or "") 804 if key not in actions_seen and (action.task or action.owner): 805 actions_seen.add(key) 806 merged.action_items.append(action) 807 808 merged.tldr = " ".join(tldr_parts).strip()[:600] 809 return merged 810 811 812 def _parse_smart_summary(raw: str) -> SmartSummary: 813 data = json.loads(raw) 814 return SmartSummary(**data) 815 816 817 # --------------------------------------------------------------------------- 818 # Render para texto Markdown (compat com exports atuais) 819 # --------------------------------------------------------------------------- 820 821 def format_smart_summary_as_text(summary: Dict[str, Any]) -> str: 822 """Converte um dict SmartSummary em Markdown legivel.""" 823 824 if not summary: 825 return "" 826 827 lines: List[str] = [] 828 tldr = summary.get("tldr") or "" 829 if tldr.strip(): 830 lines.append("**TL;DR**") 831 lines.append("") 832 lines.append(tldr.strip()) 833 lines.append("") 834 835 def _bullet_block(title: str, items: List[Any]) -> None: 836 if not items: 837 return 838 lines.append(f"**{title}**") 839 lines.append("") 840 for item in items: 841 if isinstance(item, dict): 842 owner = item.get("owner") or "(sem responsavel)" 843 task = item.get("task") or "" 844 due = item.get("due_hint") or "" 845 rendered = f"- {owner}: {task}".rstrip() 846 if due: 847 rendered += f" (prazo: {due})" 848 lines.append(rendered) 849 else: 850 value = str(item).strip() 851 if value: 852 lines.append(f"- {value}") 853 lines.append("") 854 855 _bullet_block("Pontos-chave", summary.get("key_points") or []) 856 _bullet_block("Decisoes", summary.get("decisions") or []) 857 _bullet_block("Acoes", summary.get("action_items") or []) 858 _bullet_block("Topicos", summary.get("topics") or []) 859 _bullet_block("Citacoes", summary.get("quotes") or []) 860 _bullet_block("Perguntas em aberto", summary.get("open_questions") or []) 861 862 return "\n".join(lines).strip() 863 864 865 # --------------------------------------------------------------------------- 866 # Helpers de alto nivel para fontes de texto 867 # --------------------------------------------------------------------------- 868 869 def summarize_text_file(file_path: str, model: Optional[str] = None, language: str = "pt-BR") -> str: 870 content_data = extract_text_file_content(file_path) 871 text = content_data.get("content", "") 872 if not text or not text.strip(): 873 return "Arquivo vazio - nao e possivel gerar sumario." 874 return summarize_text(text, model=model, language=language) 875 876 877 def summarize_web_page(url: str, model: Optional[str] = None, language: str = "pt-BR") -> str: 878 content_data = extract_web_content(url) 879 text = content_data.get("content", "") 880 if not text or not text.strip(): 881 return "Nao foi possivel extrair conteudo da pagina web." 882 return summarize_text(text, model=model, language=language) 883 884 885 def summarize_pdf(file_path: str, model: Optional[str] = None, language: str = "pt-BR") -> str: 886 content_data = extract_pdf_content(file_path) 887 text = content_data.get("content", "") 888 if not text or not text.strip(): 889 return "PDF vazio ou nao foi possivel extrair texto." 890 return summarize_text(text, model=model, language=language) 891 892 893 # --------------------------------------------------------------------------- 894 # Chunking utilitario 895 # --------------------------------------------------------------------------- 896 897 def _split_text_into_chunks(text: str, max_chars: int) -> List[str]: 898 """Divide texto em chunks respeitando paragrafos, sentencas e o limite.""" 899 900 chunks: List[str] = [] 901 current_chunk = "" 902 903 paragraphs = text.split("\n\n") 904 905 for paragraph in paragraphs: 906 if len(current_chunk) + len(paragraph) + 2 <= max_chars: 907 current_chunk += paragraph + "\n\n" 908 continue 909 910 if current_chunk: 911 chunks.append(current_chunk.strip()) 912 current_chunk = "" 913 914 if len(paragraph) > max_chars: 915 sentences = paragraph.split(". ") 916 temp_chunk = "" 917 for sentence in sentences: 918 if len(temp_chunk) + len(sentence) + 2 <= max_chars: 919 temp_chunk += sentence + ". " 920 else: 921 if temp_chunk: 922 chunks.append(temp_chunk.strip()) 923 temp_chunk = sentence + ". " 924 current_chunk = temp_chunk 925 else: 926 current_chunk = paragraph + "\n\n" 927 928 if current_chunk: 929 chunks.append(current_chunk.strip()) 930 931 return chunks