publish_news.py (18493B)
#!/usr/bin/env python3
"""
Read ``*.md`` and ``*.txt`` files from this folder (``site/news/``), generate entries in
``site/public/news/data/news.json`` and ``site/public/news/feed.rss``, and
update the ``lastmod`` of the ``/news/`` entry in ``site/public/sitemap.xml``.

Format of each file:
- Line 1: title
- Following lines: body
- ``.md`` uses a safe, basic Markdown subset
- ``.txt`` becomes plain text with paragraphs and line breaks preserved

Processed files are **deleted**. Files whose name starts with ``_`` are ignored
(e.g. ``_exemplo.md`` for documentation).

Do not version news in the HTML: the data lives in ``news.json`` (typically ignored by git
on the server after local content has been generated).

After publishing (without ``--dry-run``), tries ``site/genlanding.py --sync-public-only`` when the
landing DocumentRoot exists (by default ``/var/www/runv.club/html``), in order to copy
``site/public/`` to Apache. In production use ``sudo``. ``--skip-genlanding`` skips that step.

Usage::
    sudo python3 site/news/publish_news.py [--dry-run] [--verbose] [--skip-genlanding]
"""

from __future__ import annotations

import argparse
import html
import json
import re
import subprocess
import sys
import uuid
from xml.sax.saxutils import escape as xml_escape
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Final
from zoneinfo import ZoneInfo

# Directory layout is derived from this file's location: <repo>/site/news/publish_news.py.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_SITE = SCRIPT_DIR.parent
_REPO_ROOT = REPO_SITE.parent
_ADMIN_DIR = _REPO_ROOT / "scripts" / "admin"
# Must run before the ``admin_guard`` import below; presumably that module
# lives in scripts/admin and is only importable once the directory is on sys.path.
if str(_ADMIN_DIR) not in sys.path:
    sys.path.insert(0, str(_ADMIN_DIR))

from admin_guard import ensure_admin_cli

# Output locations, all under site/public/.
PUBLIC_NEWS = REPO_SITE / "public" / "news"
DATA_DIR = PUBLIC_NEWS / "data"
JSON_PATH = DATA_DIR / "news.json"
RSS_PATH = PUBLIC_NEWS / "feed.rss"
SITEMAP_PATH = REPO_SITE / "public" / "sitemap.xml"

TZ_BR: Final[str] = "America/Sao_Paulo"
# Brazil has no DST: fixed-offset fallback if ``tzdata`` is not installed (e.g. minimal Windows).
59 BR_FALLBACK_TZ = timezone(timedelta(hours=-3)) 60 SITE_URL: Final[str] = "https://runv.club" 61 DEFAULT_LANDING_DOCUMENT_ROOT: Final[Path] = Path("/var/www/runv.club/html") 62 DEFAULT_MEMBERS_USERS_JSON: Final[Path] = Path("/var/lib/runv/users.json") 63 SUPPORTED_NEWS_SUFFIXES: Final[tuple[str, ...]] = (".md", ".txt") 64 _CODE_PLACEHOLDER_RE: Final[re.Pattern[str]] = re.compile(r"\x00CODE(\d+)\x00") 65 _LINK_RE: Final[re.Pattern[str]] = re.compile(r"\[([^\]\n]+)\]\(([^)\s]+)\)") 66 _BOLD_RE: Final[re.Pattern[str]] = re.compile(r"(?<!\*)\*\*([^\n*][\s\S]*?[^\n*])\*\*(?!\*)") 67 _UNDERLINE_RE: Final[re.Pattern[str]] = re.compile(r"\+\+([^\n+][\s\S]*?[^\n+])\+\+") 68 _ITALIC_STAR_RE: Final[re.Pattern[str]] = re.compile(r"(?<!\*)\*([^\s*][^*\n]*?[^\s*])\*(?!\*)") 69 _ITALIC_UNDERSCORE_RE: Final[re.Pattern[str]] = re.compile(r"(?<!_)_([^\s_][^_\n]*?[^\s_])_(?!_)") 70 _INLINE_CODE_RE: Final[re.Pattern[str]] = re.compile(r"`([^`\n]+)`") 71 _SAFE_URL_RE: Final[re.Pattern[str]] = re.compile( 72 r"^(?:https?://|mailto:|/|#|\.{1,2}/)[^\s]*$", 73 re.IGNORECASE, 74 ) 75 76 77 def sync_landing_after_news( 78 *, 79 document_root: Path, 80 members_users_json: Path, 81 members_homes_root: Path | None, 82 verbose: bool, 83 ) -> int: 84 """ 85 Copia site/public → DocumentRoot via genlanding --sync-public-only. 86 Devolve 0 se omitido (sem script / sem DocumentRoot) ou sync OK; 1 se genlanding falhou. 
87 """ 88 gl = _REPO_ROOT / "site" / "genlanding.py" 89 if not gl.is_file(): 90 print( 91 f"AVISO: genlanding.py não encontrado em {gl}; não sincronizou DocumentRoot.", 92 file=sys.stderr, 93 ) 94 return 0 95 root = document_root.resolve() 96 if not root.is_dir(): 97 homes_opt = "" 98 if members_homes_root is not None: 99 homes_opt = f" --members-homes-root {members_homes_root.resolve()}" 100 print( 101 f"AVISO: DocumentRoot da landing inexistente ({root}) — site/public não foi copiado para Apache.\n" 102 f"Manual: sudo python3 {_REPO_ROOT / 'site' / 'genlanding.py'} --sync-public-only " 103 f"--document-root {root} --members-users-json {members_users_json}{homes_opt}", 104 file=sys.stderr, 105 ) 106 return 0 107 admin = _REPO_ROOT / "scripts" / "admin" 108 if str(admin) not in sys.path: 109 sys.path.insert(0, str(admin)) 110 from runv_landing_sync import genlanding_sync_command 111 112 cmd = genlanding_sync_command( 113 document_root=root, 114 users_json=members_users_json.resolve(), 115 homes_root=members_homes_root.resolve() if members_homes_root else None, 116 ) 117 if verbose: 118 print(f" $ {' '.join(cmd)}") 119 r = subprocess.run(cmd, capture_output=True, text=True, timeout=300) 120 if r.returncode == 0: 121 print(f"Landing sincronizada (public + members) → {root}") 122 return 0 123 combined = ((r.stdout or "") + "\n" + (r.stderr or "")).strip() 124 print( 125 f"Erro: genlanding --sync-public-only terminou com código {r.returncode}.", 126 file=sys.stderr, 127 ) 128 if combined: 129 print(combined[:4000], file=sys.stderr) 130 return 1 131 132 133 def _preserve_code_span(html_text: str, code_segments: list[str]) -> str: 134 def repl(match: re.Match[str]) -> str: 135 idx = len(code_segments) 136 code_segments.append(f"<code>{html.escape(match.group(1))}</code>") 137 return f"\x00CODE{idx}\x00" 138 139 return _INLINE_CODE_RE.sub(repl, html_text) 140 141 142 def _restore_code_span(html_text: str, code_segments: list[str]) -> str: 143 return 
_CODE_PLACEHOLDER_RE.sub( 144 lambda m: code_segments[int(m.group(1))], 145 html_text, 146 ) 147 148 149 def _safe_href(url: str) -> str | None: 150 if not _SAFE_URL_RE.fullmatch(url): 151 return None 152 if url.lower().startswith("javascript:"): 153 return None 154 return html.escape(url, quote=True) 155 156 157 def inline_markdown_to_html(text: str) -> str: 158 escaped = html.escape(text) 159 code_segments: list[str] = [] 160 escaped = _preserve_code_span(escaped, code_segments) 161 162 def repl_link(match: re.Match[str]) -> str: 163 label = inline_markdown_to_html(match.group(1)) 164 href = _safe_href(match.group(2).strip()) 165 if href is None: 166 return html.escape(match.group(0)) 167 return f'<a href="{href}">{label}</a>' 168 169 escaped = _LINK_RE.sub(repl_link, escaped) 170 escaped = _BOLD_RE.sub(lambda m: f"<strong>{m.group(1)}</strong>", escaped) 171 escaped = _UNDERLINE_RE.sub(lambda m: f"<u>{m.group(1)}</u>", escaped) 172 escaped = _ITALIC_STAR_RE.sub(lambda m: f"<em>{m.group(1)}</em>", escaped) 173 escaped = _ITALIC_UNDERSCORE_RE.sub(lambda m: f"<em>{m.group(1)}</em>", escaped) 174 return _restore_code_span(escaped, code_segments) 175 176 177 def render_plain_text_html(body: str) -> str: 178 body = body.replace("\r\n", "\n").strip() 179 if not body: 180 return "" 181 blocks = re.split(r"\n\s*\n+", body) 182 parts: list[str] = [] 183 for block in blocks: 184 lines = [html.escape(line.rstrip()) for line in block.split("\n")] 185 parts.append(f"<p>{'<br>\n'.join(lines)}</p>") 186 return "\n".join(parts) 187 188 189 def render_markdown_html(body: str) -> str: 190 body = body.replace("\r\n", "\n").strip() 191 if not body: 192 return "" 193 194 lines = body.split("\n") 195 parts: list[str] = [] 196 paragraph_lines: list[str] = [] 197 list_type: str | None = None 198 list_items: list[str] = [] 199 quote_lines: list[str] = [] 200 fence_lang = "" 201 code_lines: list[str] = [] 202 203 def flush_paragraph() -> None: 204 nonlocal paragraph_lines 205 if not 
paragraph_lines: 206 return 207 text = " ".join(line.strip() for line in paragraph_lines if line.strip()) 208 parts.append(f"<p>{inline_markdown_to_html(text)}</p>") 209 paragraph_lines = [] 210 211 def flush_list() -> None: 212 nonlocal list_type, list_items 213 if not list_items or not list_type: 214 return 215 tag = "ol" if list_type == "ol" else "ul" 216 items = "".join(f"<li>{inline_markdown_to_html(item)}</li>" for item in list_items) 217 parts.append(f"<{tag}>{items}</{tag}>") 218 list_type = None 219 list_items = [] 220 221 def flush_quote() -> None: 222 nonlocal quote_lines 223 if not quote_lines: 224 return 225 quote_html = render_markdown_html("\n".join(quote_lines)) 226 parts.append(f"<blockquote>{quote_html}</blockquote>") 227 quote_lines = [] 228 229 def flush_code() -> None: 230 nonlocal code_lines, fence_lang 231 code = "\n".join(code_lines) 232 lang_attr = "" 233 if fence_lang: 234 lang_attr = f' class="language-{html.escape(fence_lang, quote=True)}"' 235 parts.append(f"<pre><code{lang_attr}>{html.escape(code)}</code></pre>") 236 code_lines = [] 237 fence_lang = "" 238 239 def flush_all() -> None: 240 flush_paragraph() 241 flush_list() 242 flush_quote() 243 244 in_code = False 245 for raw_line in lines: 246 line = raw_line.rstrip() 247 stripped = line.strip() 248 249 if in_code: 250 if stripped.startswith("```"): 251 flush_code() 252 in_code = False 253 else: 254 code_lines.append(raw_line) 255 continue 256 257 if stripped.startswith("```"): 258 flush_all() 259 in_code = True 260 fence_lang = stripped[3:].strip() 261 code_lines = [] 262 continue 263 264 if not stripped: 265 flush_all() 266 continue 267 268 quote_match = re.match(r"^\s*>\s?(.*)$", line) 269 if quote_match: 270 flush_paragraph() 271 flush_list() 272 quote_lines.append(quote_match.group(1)) 273 continue 274 flush_quote() 275 276 heading_match = re.match(r"^(#{1,6})\s+(.+?)\s*$", stripped) 277 if heading_match: 278 flush_all() 279 level = len(heading_match.group(1)) 280 text = 
inline_markdown_to_html(heading_match.group(2)) 281 parts.append(f"<h{level}>{text}</h{level}>") 282 continue 283 284 if re.fullmatch(r"(?:-{3,}|\*{3,}|_{3,})", stripped): 285 flush_all() 286 parts.append("<hr>") 287 continue 288 289 ul_match = re.match(r"^\s*[-*+]\s+(.+)$", line) 290 if ul_match: 291 flush_paragraph() 292 if list_type not in (None, "ul"): 293 flush_list() 294 list_type = "ul" 295 list_items.append(ul_match.group(1).strip()) 296 continue 297 298 ol_match = re.match(r"^\s*\d+\.\s+(.+)$", line) 299 if ol_match: 300 flush_paragraph() 301 if list_type not in (None, "ol"): 302 flush_list() 303 list_type = "ol" 304 list_items.append(ol_match.group(1).strip()) 305 continue 306 307 flush_list() 308 paragraph_lines.append(line) 309 310 if in_code: 311 flush_code() 312 flush_all() 313 return "\n".join(parts) 314 315 316 def render_body_html(body: str, *, source_kind: str) -> str: 317 if source_kind == "txt": 318 return render_plain_text_html(body) 319 return render_markdown_html(body) 320 321 322 def parse_news_file(path: Path) -> tuple[str, str, str]: 323 raw = path.read_text(encoding="utf-8") 324 lines = raw.splitlines() 325 if not lines: 326 raise ValueError(f"{path.name}: ficheiro vazio") 327 title_line = lines[0].strip() 328 title = re.sub(r"^#\s+", "", title_line).strip() if path.suffix.lower() == ".md" else title_line 329 if not title: 330 raise ValueError(f"{path.name}: primeira linha (título) vazia") 331 body = "\n".join(lines[1:]).lstrip("\n") 332 return title, body, path.suffix.lower().lstrip(".") 333 334 335 def load_articles() -> list[dict[str, Any]]: 336 if not JSON_PATH.is_file(): 337 return [] 338 data = json.loads(JSON_PATH.read_text(encoding="utf-8")) 339 arts = data.get("articles") 340 if not isinstance(arts, list): 341 return [] 342 return arts 343 344 345 def save_articles(articles: list[dict[str, Any]]) -> None: 346 DATA_DIR.mkdir(parents=True, exist_ok=True) 347 JSON_PATH.write_text( 348 json.dumps({"articles": articles}, 
ensure_ascii=False, indent=2) + "\n", 349 encoding="utf-8", 350 ) 351 352 353 def br_date_display(now: datetime) -> str: 354 return now.strftime("%d-%m-%Y") 355 356 357 def rfc822_date(now: datetime) -> str: 358 """RFC 822 / RSS pubDate (locale inglês para dia da semana).""" 359 return now.strftime("%a, %d %b %Y %H:%M:%S %z") 360 361 362 def w3c_date(now: datetime) -> str: 363 if now.tzinfo is None: 364 now = now.replace(tzinfo=timezone.utc) 365 return now.isoformat(timespec="seconds") 366 367 368 def build_rss(articles: list[dict[str, Any]], now: datetime) -> str: 369 """RSS 2.0; descriptions em CDATA com HTML seguro gerado pelo script.""" 370 channel_parts = [ 371 '<?xml version="1.0" encoding="UTF-8"?>', 372 '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">', 373 "<channel>", 374 f"<title>Notícias — runv.club</title>", 375 f"<link>{SITE_URL}/news/</link>", 376 "<description>Comunicados e atualizações da comunidade runv.club</description>", 377 f"<language>pt-BR</language>", 378 f"<lastBuildDate>{rfc822_date(now)}</lastBuildDate>", 379 f'<atom:link href="{SITE_URL}/news/feed.rss" rel="self" type="application/rss+xml"/>', 380 ] 381 for art in articles[:50]: 382 title = xml_escape(str(art["title"])) 383 aid = xml_escape(str(art["id"])) 384 link = f"{SITE_URL}/news/#{aid}" 385 pub = art.get("pub_rfc822") or rfc822_date(now) 386 body = art.get("body_html") or "" 387 desc = f"<![CDATA[{body}]]>" 388 channel_parts.extend( 389 [ 390 "<item>", 391 f"<title>{title}</title>", 392 f"<link>{link}</link>", 393 f"<guid isPermaLink=\"false\">{SITE_URL}/news/item-{aid}</guid>", 394 f"<pubDate>{pub}</pubDate>", 395 f"<description>{desc}</description>", 396 "</item>", 397 ] 398 ) 399 channel_parts.extend(["</channel>", "</rss>"]) 400 return "\n".join(channel_parts) + "\n" 401 402 403 def update_sitemap_lastmod(news_lastmod: str) -> None: 404 """Actualiza ou insere ``<lastmod>`` só no URL ``/news/``, sem reescrever prefixos XML.""" 405 if not SITEMAP_PATH.is_file(): 406 
return 407 text = SITEMAP_PATH.read_text(encoding="utf-8") 408 news_loc = f"<loc>{SITE_URL}/news/</loc>" 409 if news_loc not in text: 410 return 411 lastmod_tag = f"<lastmod>{news_lastmod}</lastmod>" 412 block_re = re.compile( 413 rf"(\s*<url>\s*{re.escape(news_loc)})(\s*<lastmod>[^<]*</lastmod>)?(\s*</url>)", 414 re.DOTALL, 415 ) 416 417 def repl(m: re.Match[str]) -> str: 418 return f"{m.group(1)}\n {lastmod_tag}{m.group(3)}" 419 420 new_text, n = block_re.subn(repl, text, count=1) 421 if n: 422 SITEMAP_PATH.write_text(new_text, encoding="utf-8") 423 424 425 def discover_news_files() -> list[Path]: 426 out: list[Path] = [] 427 skip = frozenset({"readme.md", "readme.markdown", "readme.txt"}) 428 for p in sorted(SCRIPT_DIR.iterdir()): 429 if not p.is_file(): 430 continue 431 if p.name.startswith("_"): 432 continue 433 lower_name = p.name.lower() 434 if lower_name in skip: 435 continue 436 if p.suffix.lower() not in SUPPORTED_NEWS_SUFFIXES: 437 continue 438 out.append(p) 439 return out 440 441 442 def main() -> int: 443 ap = argparse.ArgumentParser(description="Publica notícias a partir de .md e .txt em site/news/") 444 ap.add_argument("--dry-run", action="store_true", help="Só mostra o que faria") 445 ap.add_argument("--verbose", "-v", action="store_true") 446 ap.add_argument( 447 "--landing-document-root", 448 type=Path, 449 default=DEFAULT_LANDING_DOCUMENT_ROOT, 450 help=( 451 "DocumentRoot Apache; se existir como directório e não usar --skip-genlanding, " 452 "corre site/genlanding.py --sync-public-only após publicar" 453 ), 454 ) 455 ap.add_argument( 456 "--members-users-json", 457 type=Path, 458 default=DEFAULT_MEMBERS_USERS_JSON, 459 help="Fonte para data/members.json no genlanding (default: /var/lib/runv/users.json)", 460 ) 461 ap.add_argument( 462 "--members-homes-root", 463 type=Path, 464 default=None, 465 help="Opcional: --members-homes-root para genlanding (ex. 
/home)", 466 ) 467 ap.add_argument( 468 "--skip-genlanding", 469 action="store_true", 470 help="Não copiar site/public para DocumentRoot após publicar", 471 ) 472 args = ap.parse_args() 473 ensure_admin_cli( 474 script_name=Path(__file__).name, 475 dry_run=bool(args.dry_run), 476 ) 477 478 try: 479 now = datetime.now(timezone.utc).astimezone(ZoneInfo(TZ_BR)) 480 except Exception: 481 now = datetime.now(BR_FALLBACK_TZ) 482 483 news_files = discover_news_files() 484 if not news_files: 485 print("Nenhum ficheiro .md ou .txt para processar (ignore _*).", file=sys.stderr) 486 return 0 487 488 articles = load_articles() 489 pub_rfc = rfc822_date(now) 490 date_br = br_date_display(now) 491 w3c = w3c_date(now) 492 493 new_entries: list[dict[str, Any]] = [] 494 for path in news_files: 495 try: 496 title, body_source, source_kind = parse_news_file(path) 497 except ValueError as e: 498 print(f"Erro em {path.name}: {e}", file=sys.stderr) 499 return 1 500 body_html = render_body_html(body_source, source_kind=source_kind) 501 entry = { 502 "id": uuid.uuid4().hex[:12], 503 "title": title, 504 "date": date_br, 505 "body_html": body_html, 506 "pub_rfc822": pub_rfc, 507 "w3c_published": w3c, 508 } 509 new_entries.append((path, entry)) 510 if args.verbose: 511 print(f" + {path.name} -> {title!r}") 512 513 if args.dry_run: 514 print(f"[dry-run] {len(new_entries)} notícia(s); não gravou nem apagou ficheiros.") 515 return 0 516 517 for _path, entry in new_entries: 518 articles.insert(0, entry) 519 520 save_articles(articles) 521 RSS_PATH.write_text(build_rss(articles, now), encoding="utf-8") 522 523 update_sitemap_lastmod(w3c) 524 525 for path, _entry in new_entries: 526 path.unlink() 527 if args.verbose: 528 print(f" removido {path.name}") 529 530 print(f"Publicadas {len(new_entries)} notícia(s). 
Total: {len(articles)}.") 531 532 if not args.skip_genlanding: 533 rc = sync_landing_after_news( 534 document_root=args.landing_document_root, 535 members_users_json=args.members_users_json, 536 members_homes_root=args.members_homes_root, 537 verbose=args.verbose, 538 ) 539 if rc != 0: 540 return rc 541 542 return 0 543 544 545 if __name__ == "__main__": 546 raise SystemExit(main())