runv-server

server tooling for runv.club
Log | Files | Refs | README

publish_news.py (18493B)


      1 #!/usr/bin/env python3
      2 """
      3 Lê ficheiros ``*.md`` e ``*.txt`` nesta pasta (``site/news/``), gera entradas em
      4 ``site/public/news/data/news.json``, ``site/public/news/feed.rss`` e
      5 actualiza ``lastmod`` da entrada ``/news/`` em ``site/public/sitemap.xml``.
      6 
      7 Formato de cada ficheiro:
      8   - Linha 1: título
      9   - Linhas seguintes: corpo
     10   - ``.md`` usa Markdown básico seguro
     11   - ``.txt`` vira texto simples com parágrafos e quebras de linha preservadas
     12 
     13 Os ficheiros processados são **apagados**. Ficheiros cujo nome começa por ``_`` são ignorados
     14 (ex.: ``_exemplo.md`` para documentação).
     15 
     16 Não versionar notícias no HTML: os dados ficam em ``news.json`` (tipicamente ignorado pelo git
     17 no servidor após gerar conteúdo local).
     18 
     19 Após publicar (sem ``--dry-run``), tenta ``site/genlanding.py --sync-public-only`` quando o
     20 DocumentRoot da landing existir (por omissão ``/var/www/runv.club/html``), para copiar
     21 ``site/public/`` para o Apache. Em produção use ``sudo``. ``--skip-genlanding`` omite esse passo.
     22 
     23 Uso::
     24     sudo python3 site/news/publish_news.py [--dry-run] [--verbose] [--skip-genlanding]
     25 """
     26 
     27 from __future__ import annotations
     28 
     29 import argparse
     30 import html
     31 import json
     32 import re
     33 import subprocess
     34 import sys
     35 import uuid
     36 from xml.sax.saxutils import escape as xml_escape
     37 from datetime import datetime, timedelta, timezone
     38 from pathlib import Path
     39 from typing import Any, Final
     40 from zoneinfo import ZoneInfo
     41 
# Directory containing this script; it is also where the news sources are
# discovered (see ``discover_news_files``).
SCRIPT_DIR = Path(__file__).resolve().parent
# Parent of the news directory, and its own parent (used as the repository root).
REPO_SITE = SCRIPT_DIR.parent
_REPO_ROOT = REPO_SITE.parent
# Make ``<repo>/scripts/admin`` importable so the ``admin_guard`` import that
# follows can be resolved; the entry is added only once.
_ADMIN_DIR = _REPO_ROOT / "scripts" / "admin"
if str(_ADMIN_DIR) not in sys.path:
    sys.path.insert(0, str(_ADMIN_DIR))
     48 
     49 from admin_guard import ensure_admin_cli
     50 
# Output locations, all relative to ``<site>/public``.
PUBLIC_NEWS = REPO_SITE / "public" / "news"
DATA_DIR = PUBLIC_NEWS / "data"
JSON_PATH = DATA_DIR / "news.json"
RSS_PATH = PUBLIC_NEWS / "feed.rss"
SITEMAP_PATH = REPO_SITE / "public" / "sitemap.xml"

# IANA zone name used to timestamp publications.
TZ_BR: Final[str] = "America/Sao_Paulo"
# Brazil without DST: fixed UTC-3 fallback if ``tzdata`` is not installed (e.g. minimal Windows).
BR_FALLBACK_TZ = timezone(timedelta(hours=-3))
SITE_URL: Final[str] = "https://runv.club"
DEFAULT_LANDING_DOCUMENT_ROOT: Final[Path] = Path("/var/www/runv.club/html")
DEFAULT_MEMBERS_USERS_JSON: Final[Path] = Path("/var/lib/runv/users.json")
# File extensions (lower-case, with dot) that are treated as news sources.
SUPPORTED_NEWS_SUFFIXES: Final[tuple[str, ...]] = (".md", ".txt")
# Internal placeholder ``\x00CODE<n>\x00`` standing in for an already-rendered segment.
_CODE_PLACEHOLDER_RE: Final[re.Pattern[str]] = re.compile(r"\x00CODE(\d+)\x00")
# ``[label](url)`` — the label may not contain ``]`` or a newline; the url may not
# contain ``)`` or whitespace.
_LINK_RE: Final[re.Pattern[str]] = re.compile(r"\[([^\]\n]+)\]\(([^)\s]+)\)")
# ``**bold**`` — not adjacent to a third asterisk; content may span lines.
_BOLD_RE: Final[re.Pattern[str]] = re.compile(r"(?<!\*)\*\*([^\n*][\s\S]*?[^\n*])\*\*(?!\*)")
# ``++underline++``.
_UNDERLINE_RE: Final[re.Pattern[str]] = re.compile(r"\+\+([^\n+][\s\S]*?[^\n+])\+\+")
# ``*italic*`` — single line, content must not start or end with whitespace.
_ITALIC_STAR_RE: Final[re.Pattern[str]] = re.compile(r"(?<!\*)\*([^\s*][^*\n]*?[^\s*])\*(?!\*)")
# ``_italic_`` — single line, content must not start or end with whitespace.
_ITALIC_UNDERSCORE_RE: Final[re.Pattern[str]] = re.compile(r"(?<!_)_([^\s_][^_\n]*?[^\s_])_(?!_)")
# `` `inline code` `` — single line, no backticks inside.
_INLINE_CODE_RE: Final[re.Pattern[str]] = re.compile(r"`([^`\n]+)`")
# Allow-list for link targets: http(s) URLs, ``mailto:``, root-relative paths,
# fragments and ``./`` / ``../`` relative paths; whitespace is never allowed.
_SAFE_URL_RE: Final[re.Pattern[str]] = re.compile(
    r"^(?:https?://|mailto:|/|#|\.{1,2}/)[^\s]*$",
    re.IGNORECASE,
)
     75 
     76 
     77 def sync_landing_after_news(
     78     *,
     79     document_root: Path,
     80     members_users_json: Path,
     81     members_homes_root: Path | None,
     82     verbose: bool,
     83 ) -> int:
     84     """
     85     Copia site/public → DocumentRoot via genlanding --sync-public-only.
     86     Devolve 0 se omitido (sem script / sem DocumentRoot) ou sync OK; 1 se genlanding falhou.
     87     """
     88     gl = _REPO_ROOT / "site" / "genlanding.py"
     89     if not gl.is_file():
     90         print(
     91             f"AVISO: genlanding.py não encontrado em {gl}; não sincronizou DocumentRoot.",
     92             file=sys.stderr,
     93         )
     94         return 0
     95     root = document_root.resolve()
     96     if not root.is_dir():
     97         homes_opt = ""
     98         if members_homes_root is not None:
     99             homes_opt = f" --members-homes-root {members_homes_root.resolve()}"
    100         print(
    101             f"AVISO: DocumentRoot da landing inexistente ({root}) — site/public não foi copiado para Apache.\n"
    102             f"Manual: sudo python3 {_REPO_ROOT / 'site' / 'genlanding.py'} --sync-public-only "
    103             f"--document-root {root} --members-users-json {members_users_json}{homes_opt}",
    104             file=sys.stderr,
    105         )
    106         return 0
    107     admin = _REPO_ROOT / "scripts" / "admin"
    108     if str(admin) not in sys.path:
    109         sys.path.insert(0, str(admin))
    110     from runv_landing_sync import genlanding_sync_command
    111 
    112     cmd = genlanding_sync_command(
    113         document_root=root,
    114         users_json=members_users_json.resolve(),
    115         homes_root=members_homes_root.resolve() if members_homes_root else None,
    116     )
    117     if verbose:
    118         print(f"  $ {' '.join(cmd)}")
    119     r = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
    120     if r.returncode == 0:
    121         print(f"Landing sincronizada (public + members) → {root}")
    122         return 0
    123     combined = ((r.stdout or "") + "\n" + (r.stderr or "")).strip()
    124     print(
    125         f"Erro: genlanding --sync-public-only terminou com código {r.returncode}.",
    126         file=sys.stderr,
    127     )
    128     if combined:
    129         print(combined[:4000], file=sys.stderr)
    130     return 1
    131 
    132 
    133 def _preserve_code_span(html_text: str, code_segments: list[str]) -> str:
    134     def repl(match: re.Match[str]) -> str:
    135         idx = len(code_segments)
    136         code_segments.append(f"<code>{html.escape(match.group(1))}</code>")
    137         return f"\x00CODE{idx}\x00"
    138 
    139     return _INLINE_CODE_RE.sub(repl, html_text)
    140 
    141 
    142 def _restore_code_span(html_text: str, code_segments: list[str]) -> str:
    143     return _CODE_PLACEHOLDER_RE.sub(
    144         lambda m: code_segments[int(m.group(1))],
    145         html_text,
    146     )
    147 
    148 
    149 def _safe_href(url: str) -> str | None:
    150     if not _SAFE_URL_RE.fullmatch(url):
    151         return None
    152     if url.lower().startswith("javascript:"):
    153         return None
    154     return html.escape(url, quote=True)
    155 
    156 
    157 def inline_markdown_to_html(text: str) -> str:
    158     escaped = html.escape(text)
    159     code_segments: list[str] = []
    160     escaped = _preserve_code_span(escaped, code_segments)
    161 
    162     def repl_link(match: re.Match[str]) -> str:
    163         label = inline_markdown_to_html(match.group(1))
    164         href = _safe_href(match.group(2).strip())
    165         if href is None:
    166             return html.escape(match.group(0))
    167         return f'<a href="{href}">{label}</a>'
    168 
    169     escaped = _LINK_RE.sub(repl_link, escaped)
    170     escaped = _BOLD_RE.sub(lambda m: f"<strong>{m.group(1)}</strong>", escaped)
    171     escaped = _UNDERLINE_RE.sub(lambda m: f"<u>{m.group(1)}</u>", escaped)
    172     escaped = _ITALIC_STAR_RE.sub(lambda m: f"<em>{m.group(1)}</em>", escaped)
    173     escaped = _ITALIC_UNDERSCORE_RE.sub(lambda m: f"<em>{m.group(1)}</em>", escaped)
    174     return _restore_code_span(escaped, code_segments)
    175 
    176 
    177 def render_plain_text_html(body: str) -> str:
    178     body = body.replace("\r\n", "\n").strip()
    179     if not body:
    180         return ""
    181     blocks = re.split(r"\n\s*\n+", body)
    182     parts: list[str] = []
    183     for block in blocks:
    184         lines = [html.escape(line.rstrip()) for line in block.split("\n")]
    185         parts.append(f"<p>{'<br>\n'.join(lines)}</p>")
    186     return "\n".join(parts)
    187 
    188 
    189 def render_markdown_html(body: str) -> str:
    190     body = body.replace("\r\n", "\n").strip()
    191     if not body:
    192         return ""
    193 
    194     lines = body.split("\n")
    195     parts: list[str] = []
    196     paragraph_lines: list[str] = []
    197     list_type: str | None = None
    198     list_items: list[str] = []
    199     quote_lines: list[str] = []
    200     fence_lang = ""
    201     code_lines: list[str] = []
    202 
    203     def flush_paragraph() -> None:
    204         nonlocal paragraph_lines
    205         if not paragraph_lines:
    206             return
    207         text = " ".join(line.strip() for line in paragraph_lines if line.strip())
    208         parts.append(f"<p>{inline_markdown_to_html(text)}</p>")
    209         paragraph_lines = []
    210 
    211     def flush_list() -> None:
    212         nonlocal list_type, list_items
    213         if not list_items or not list_type:
    214             return
    215         tag = "ol" if list_type == "ol" else "ul"
    216         items = "".join(f"<li>{inline_markdown_to_html(item)}</li>" for item in list_items)
    217         parts.append(f"<{tag}>{items}</{tag}>")
    218         list_type = None
    219         list_items = []
    220 
    221     def flush_quote() -> None:
    222         nonlocal quote_lines
    223         if not quote_lines:
    224             return
    225         quote_html = render_markdown_html("\n".join(quote_lines))
    226         parts.append(f"<blockquote>{quote_html}</blockquote>")
    227         quote_lines = []
    228 
    229     def flush_code() -> None:
    230         nonlocal code_lines, fence_lang
    231         code = "\n".join(code_lines)
    232         lang_attr = ""
    233         if fence_lang:
    234             lang_attr = f' class="language-{html.escape(fence_lang, quote=True)}"'
    235         parts.append(f"<pre><code{lang_attr}>{html.escape(code)}</code></pre>")
    236         code_lines = []
    237         fence_lang = ""
    238 
    239     def flush_all() -> None:
    240         flush_paragraph()
    241         flush_list()
    242         flush_quote()
    243 
    244     in_code = False
    245     for raw_line in lines:
    246         line = raw_line.rstrip()
    247         stripped = line.strip()
    248 
    249         if in_code:
    250             if stripped.startswith("```"):
    251                 flush_code()
    252                 in_code = False
    253             else:
    254                 code_lines.append(raw_line)
    255             continue
    256 
    257         if stripped.startswith("```"):
    258             flush_all()
    259             in_code = True
    260             fence_lang = stripped[3:].strip()
    261             code_lines = []
    262             continue
    263 
    264         if not stripped:
    265             flush_all()
    266             continue
    267 
    268         quote_match = re.match(r"^\s*>\s?(.*)$", line)
    269         if quote_match:
    270             flush_paragraph()
    271             flush_list()
    272             quote_lines.append(quote_match.group(1))
    273             continue
    274         flush_quote()
    275 
    276         heading_match = re.match(r"^(#{1,6})\s+(.+?)\s*$", stripped)
    277         if heading_match:
    278             flush_all()
    279             level = len(heading_match.group(1))
    280             text = inline_markdown_to_html(heading_match.group(2))
    281             parts.append(f"<h{level}>{text}</h{level}>")
    282             continue
    283 
    284         if re.fullmatch(r"(?:-{3,}|\*{3,}|_{3,})", stripped):
    285             flush_all()
    286             parts.append("<hr>")
    287             continue
    288 
    289         ul_match = re.match(r"^\s*[-*+]\s+(.+)$", line)
    290         if ul_match:
    291             flush_paragraph()
    292             if list_type not in (None, "ul"):
    293                 flush_list()
    294             list_type = "ul"
    295             list_items.append(ul_match.group(1).strip())
    296             continue
    297 
    298         ol_match = re.match(r"^\s*\d+\.\s+(.+)$", line)
    299         if ol_match:
    300             flush_paragraph()
    301             if list_type not in (None, "ol"):
    302                 flush_list()
    303             list_type = "ol"
    304             list_items.append(ol_match.group(1).strip())
    305             continue
    306 
    307         flush_list()
    308         paragraph_lines.append(line)
    309 
    310     if in_code:
    311         flush_code()
    312     flush_all()
    313     return "\n".join(parts)
    314 
    315 
    316 def render_body_html(body: str, *, source_kind: str) -> str:
    317     if source_kind == "txt":
    318         return render_plain_text_html(body)
    319     return render_markdown_html(body)
    320 
    321 
    322 def parse_news_file(path: Path) -> tuple[str, str, str]:
    323     raw = path.read_text(encoding="utf-8")
    324     lines = raw.splitlines()
    325     if not lines:
    326         raise ValueError(f"{path.name}: ficheiro vazio")
    327     title_line = lines[0].strip()
    328     title = re.sub(r"^#\s+", "", title_line).strip() if path.suffix.lower() == ".md" else title_line
    329     if not title:
    330         raise ValueError(f"{path.name}: primeira linha (título) vazia")
    331     body = "\n".join(lines[1:]).lstrip("\n")
    332     return title, body, path.suffix.lower().lstrip(".")
    333 
    334 
    335 def load_articles() -> list[dict[str, Any]]:
    336     if not JSON_PATH.is_file():
    337         return []
    338     data = json.loads(JSON_PATH.read_text(encoding="utf-8"))
    339     arts = data.get("articles")
    340     if not isinstance(arts, list):
    341         return []
    342     return arts
    343 
    344 
    345 def save_articles(articles: list[dict[str, Any]]) -> None:
    346     DATA_DIR.mkdir(parents=True, exist_ok=True)
    347     JSON_PATH.write_text(
    348         json.dumps({"articles": articles}, ensure_ascii=False, indent=2) + "\n",
    349         encoding="utf-8",
    350     )
    351 
    352 
    353 def br_date_display(now: datetime) -> str:
    354     return now.strftime("%d-%m-%Y")
    355 
    356 
    357 def rfc822_date(now: datetime) -> str:
    358     """RFC 822 / RSS pubDate (locale inglês para dia da semana)."""
    359     return now.strftime("%a, %d %b %Y %H:%M:%S %z")
    360 
    361 
    362 def w3c_date(now: datetime) -> str:
    363     if now.tzinfo is None:
    364         now = now.replace(tzinfo=timezone.utc)
    365     return now.isoformat(timespec="seconds")
    366 
    367 
    368 def build_rss(articles: list[dict[str, Any]], now: datetime) -> str:
    369     """RSS 2.0; descriptions em CDATA com HTML seguro gerado pelo script."""
    370     channel_parts = [
    371         '<?xml version="1.0" encoding="UTF-8"?>',
    372         '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">',
    373         "<channel>",
    374         f"<title>Notícias — runv.club</title>",
    375         f"<link>{SITE_URL}/news/</link>",
    376         "<description>Comunicados e atualizações da comunidade runv.club</description>",
    377         f"<language>pt-BR</language>",
    378         f"<lastBuildDate>{rfc822_date(now)}</lastBuildDate>",
    379         f'<atom:link href="{SITE_URL}/news/feed.rss" rel="self" type="application/rss+xml"/>',
    380     ]
    381     for art in articles[:50]:
    382         title = xml_escape(str(art["title"]))
    383         aid = xml_escape(str(art["id"]))
    384         link = f"{SITE_URL}/news/#{aid}"
    385         pub = art.get("pub_rfc822") or rfc822_date(now)
    386         body = art.get("body_html") or ""
    387         desc = f"<![CDATA[{body}]]>"
    388         channel_parts.extend(
    389             [
    390                 "<item>",
    391                 f"<title>{title}</title>",
    392                 f"<link>{link}</link>",
    393                 f"<guid isPermaLink=\"false\">{SITE_URL}/news/item-{aid}</guid>",
    394                 f"<pubDate>{pub}</pubDate>",
    395                 f"<description>{desc}</description>",
    396                 "</item>",
    397             ]
    398         )
    399     channel_parts.extend(["</channel>", "</rss>"])
    400     return "\n".join(channel_parts) + "\n"
    401 
    402 
    403 def update_sitemap_lastmod(news_lastmod: str) -> None:
    404     """Actualiza ou insere ``<lastmod>`` só no URL ``/news/``, sem reescrever prefixos XML."""
    405     if not SITEMAP_PATH.is_file():
    406         return
    407     text = SITEMAP_PATH.read_text(encoding="utf-8")
    408     news_loc = f"<loc>{SITE_URL}/news/</loc>"
    409     if news_loc not in text:
    410         return
    411     lastmod_tag = f"<lastmod>{news_lastmod}</lastmod>"
    412     block_re = re.compile(
    413         rf"(\s*<url>\s*{re.escape(news_loc)})(\s*<lastmod>[^<]*</lastmod>)?(\s*</url>)",
    414         re.DOTALL,
    415     )
    416 
    417     def repl(m: re.Match[str]) -> str:
    418         return f"{m.group(1)}\n    {lastmod_tag}{m.group(3)}"
    419 
    420     new_text, n = block_re.subn(repl, text, count=1)
    421     if n:
    422         SITEMAP_PATH.write_text(new_text, encoding="utf-8")
    423 
    424 
    425 def discover_news_files() -> list[Path]:
    426     out: list[Path] = []
    427     skip = frozenset({"readme.md", "readme.markdown", "readme.txt"})
    428     for p in sorted(SCRIPT_DIR.iterdir()):
    429         if not p.is_file():
    430             continue
    431         if p.name.startswith("_"):
    432             continue
    433         lower_name = p.name.lower()
    434         if lower_name in skip:
    435             continue
    436         if p.suffix.lower() not in SUPPORTED_NEWS_SUFFIXES:
    437             continue
    438         out.append(p)
    439     return out
    440 
    441 
def main() -> int:
    """Command-line entry point: publish pending news files.

    Parses the arguments, renders every discovered source file, prepends the
    new entries to ``news.json``, rewrites the RSS feed, updates the sitemap,
    deletes the processed sources and, unless ``--skip-genlanding`` is given,
    runs the landing sync.

    Returns:
        Process exit code: 0 on success (including "nothing to do" and
        ``--dry-run``), 1 when a source file is invalid, otherwise the
        non-zero code from the landing sync.
    """
    ap = argparse.ArgumentParser(description="Publica notícias a partir de .md e .txt em site/news/")
    ap.add_argument("--dry-run", action="store_true", help="Só mostra o que faria")
    ap.add_argument("--verbose", "-v", action="store_true")
    ap.add_argument(
        "--landing-document-root",
        type=Path,
        default=DEFAULT_LANDING_DOCUMENT_ROOT,
        help=(
            "DocumentRoot Apache; se existir como directório e não usar --skip-genlanding, "
            "corre site/genlanding.py --sync-public-only após publicar"
        ),
    )
    ap.add_argument(
        "--members-users-json",
        type=Path,
        default=DEFAULT_MEMBERS_USERS_JSON,
        help="Fonte para data/members.json no genlanding (default: /var/lib/runv/users.json)",
    )
    ap.add_argument(
        "--members-homes-root",
        type=Path,
        default=None,
        help="Opcional: --members-homes-root para genlanding (ex. /home)",
    )
    ap.add_argument(
        "--skip-genlanding",
        action="store_true",
        help="Não copiar site/public para DocumentRoot após publicar",
    )
    args = ap.parse_args()
    # NOTE(review): ensure_admin_cli comes from admin_guard (not visible here);
    # presumably it enforces admin privileges unless dry_run — confirm.
    ensure_admin_cli(
        script_name=Path(__file__).name,
        dry_run=bool(args.dry_run),
    )

    # One timestamp is shared by every article published in this run. Fall back
    # to a fixed UTC-3 offset if the named zone cannot be loaded.
    try:
        now = datetime.now(timezone.utc).astimezone(ZoneInfo(TZ_BR))
    except Exception:
        now = datetime.now(BR_FALLBACK_TZ)

    news_files = discover_news_files()
    if not news_files:
        print("Nenhum ficheiro .md ou .txt para processar (ignore _*).", file=sys.stderr)
        return 0

    articles = load_articles()
    pub_rfc = rfc822_date(now)
    date_br = br_date_display(now)
    w3c = w3c_date(now)

    # Render everything first; nothing is written until all files parsed, so a
    # bad file aborts the run before any output changes.
    new_entries: list[tuple[Path, dict[str, Any]]] = []
    for path in news_files:
        try:
            title, body_source, source_kind = parse_news_file(path)
        except ValueError as e:
            print(f"Erro em {path.name}: {e}", file=sys.stderr)
            return 1
        body_html = render_body_html(body_source, source_kind=source_kind)
        entry = {
            "id": uuid.uuid4().hex[:12],
            "title": title,
            "date": date_br,
            "body_html": body_html,
            "pub_rfc822": pub_rfc,
            "w3c_published": w3c,
        }
        new_entries.append((path, entry))
        if args.verbose:
            print(f"  + {path.name} -> {title!r}")

    if args.dry_run:
        print(f"[dry-run] {len(new_entries)} notícia(s); não gravou nem apagou ficheiros.")
        return 0

    # Each entry is inserted at index 0 in discovery (sorted) order, so the
    # file that sorts last ends up first in the list.
    for _path, entry in new_entries:
        articles.insert(0, entry)

    save_articles(articles)
    RSS_PATH.write_text(build_rss(articles, now), encoding="utf-8")

    update_sitemap_lastmod(w3c)

    # Sources are removed only after the JSON, feed and sitemap were written.
    for path, _entry in new_entries:
        path.unlink()
        if args.verbose:
            print(f"  removido {path.name}")

    print(f"Publicadas {len(new_entries)} notícia(s). Total: {len(articles)}.")

    if not args.skip_genlanding:
        rc = sync_landing_after_news(
            document_root=args.landing_document_root,
            members_users_json=args.members_users_json,
            members_homes_root=args.members_homes_root,
            verbose=args.verbose,
        )
        if rc != 0:
            return rc

    return 0
    543 
    544 
# Run only when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())