From dbda00405b6ed55511b549b51b9a0f65314941f5 Mon Sep 17 00:00:00 2001 From: fundor333 Date: Sun, 27 Jul 2025 19:35:30 +0200 Subject: [PATCH] Enhances Mastodon link handling and previews Adds functionality to identify Mastodon links in Hugo markdown files, extract relevant information like instance and ID, and generate previews. This allows for embedding Mastodon toots directly into the generated Hugo site and provides a preview text extracted from the toot content improving the user experience and content integration. It also adds a fallback in case a toot disappears to avoid broken content. Also adds `bleach` as a dependency to sanitize HTML content. --- action_script/replay-getter.py | 73 +++++++++++++++++++++++++++++++++- layouts/partials/micro.html | 12 ++++++ layouts/partials/toot.html | 63 +++++++++++++++++++++++++++++ poetry.lock | 32 ++++++++++++++- pyproject.toml | 1 + 5 files changed, 179 insertions(+), 2 deletions(-) create mode 100644 layouts/partials/toot.html diff --git a/action_script/replay-getter.py b/action_script/replay-getter.py index aae7ba81..a07dc821 100644 --- a/action_script/replay-getter.py +++ b/action_script/replay-getter.py @@ -3,10 +3,75 @@ import re import requests import frontmatter from bs4 import BeautifulSoup +from urllib.parse import urlparse # --- Funzioni di Supporto --- HUGO_CONTENT_PATH = "content" +MAX_LENGHT = 800 + + +def get_instance_and_id(url): + """ + Estrae l'istanza (hostname) e un potenziale ID da un URL, + basandosi su pattern comuni di Mastodon. + + Args: + url (str): La stringa URL da analizzare. + + Returns: + tuple: Una tupla contenente (istanza, id). + Restituisce (None, None) se l'URL non è ben formato + o se non è possibile estrarre un'istanza. + """ + parsed_url = urlparse(url) + + instance = parsed_url.netloc if parsed_url.netloc else None + + if not instance: + return None, None + + path_segments = parsed_url.path.strip("/").split("/") + + # Logica per trovare l'ID basandosi sui pattern di Mastodon + if len(path_segments) >= 2 and path_segments[0].startswith("@"): + if len(path_segments) == 2: + if path_segments[1].isdigit(): + return instance, path_segments[1] + else: + return instance, path_segments[0] + elif ( + len(path_segments) > 2 + and path_segments[1] == "statuses" + and path_segments[2].isdigit() + ): + return instance, path_segments[2] + elif len(path_segments) > 2 and path_segments[2].isdigit(): + return instance, path_segments[2] + + elif ( + len(path_segments) >= 3 + and path_segments[0] == "web" + and path_segments[1] == "statuses" + and path_segments[2].isdigit() + ): + return instance, path_segments[2] + + elif ( + len(path_segments) >= 4 + and path_segments[0] == "users" + and path_segments[2] == "statuses" + and path_segments[3].isdigit() + ): + return instance, path_segments[3] + + if path_segments: + if path_segments[-1].isdigit(): + return instance, path_segments[-1] + elif path_segments[0].startswith("@") and len(path_segments) == 1: + return instance, path_segments[0] + + return instance, None # Nessun ID specifico trovato per URL di base o generici def get_page_content(url): @@ -23,7 +88,7 @@ def get_page_content(url): return None -def extract_preview_from_html(html_content, max_length=200): +def extract_preview_from_html(html_content, max_length=MAX_LENGHT): """ Estrae una porzione di testo pulita dal contenuto HTML per una preview. Prioritizza l'estrazione da: @@ -146,6 +211,10 @@ def process_hugo_markdown_files(root_dir): if is_mastodon_link(reply_url): if post.metadata.get("mastodon_reply") is not True: post.metadata["mastodon_reply"] = True + ( + post.metadata["mastodon_instance"], + post.metadata["mastodon_id"], + ) = get_instance_and_id(reply_url) modified = True print( f" Flag 'mastodon_reply: true' aggiunto/aggiornato per {reply_url}" @@ -153,6 +222,8 @@ def process_hugo_markdown_files(root_dir): elif post.metadata.get("mastodon_reply") is True: # Se non è più un link Mastodon ma il flag era presente, rimuovilo del post.metadata["mastodon_reply"] + del post.metadata["mastodon_instance"] + del post.metadata["mastodon_id"] modified = True print( f" Flag 'mastodon_reply' rimosso per {reply_url} (non più Mastodon)." diff --git a/layouts/partials/micro.html b/layouts/partials/micro.html index 52307498..69376ea9 100644 --- a/layouts/partials/micro.html +++ b/layouts/partials/micro.html @@ -13,3 +13,15 @@ {{with .Params.Rspv}}

RSPV of {{.}}

{{end}} + +{{with .Params.Preview_text_from_reply}} +
+

{{.}}

+
+{{end}} + +{{ if .Params.mastodon_reply}} + +{{ partial "toot" . }} + +{{end}} diff --git a/layouts/partials/toot.html b/layouts/partials/toot.html new file mode 100644 index 00000000..a080be95 --- /dev/null +++ b/layouts/partials/toot.html @@ -0,0 +1,63 @@ + +{{ $masIns := .Params.mastodon_instance }} +{{ $id := .Params.mastodon_id }} +{{ $tootLink := "" }} +{{ $handleInst := "" }} +{{ $urlToGet := print "https://" $masIns "/api/v1/statuses/" $id }} + +{{ with resources.GetRemote $urlToGet }} + {{ $json := .Content | unmarshal }} + {{ if isset $json "account" }} + {{ $tootLink = print "https://" $masIns "@" $json.account.acct "/status/" $id }} + {{ $handleInst = print "@" $json.account.acct "@" $masIns }} + {{ end }} + + {{ if isset $json "content" }} +
+ + +
{{ $json.content | safeHTML }}
+ + {{ with $json.media_attachments }} + {{ $count := len . }} +
+ {{ range . }} + {{ if eq .type "image" }} +
+ +
+ {{ end }} + {{ end }} +
+ {{ end }} + + +
+ {{ end }} +{{ else }} +
+

+ [Source not online at time of site build.] +

+
+{{ end }} diff --git a/poetry.lock b/poetry.lock index cf3534dd..90c8c04f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -23,6 +23,24 @@ charset-normalizer = ["charset-normalizer"] html5lib = ["html5lib"] lxml = ["lxml"] +[[package]] +name = "bleach" +version = "6.2.0" +description = "An easy safelist-based HTML-sanitizing tool." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "bleach-6.2.0-py3-none-any.whl", hash = "sha256:117d9c6097a7c3d22fd578fcd8d35ff1e125df6736f554da4e432fdd63f31e5e"}, + {file = "bleach-6.2.0.tar.gz", hash = "sha256:123e894118b8a599fd80d3ec1a6d4cc7ce4e5882b1317a7e1ba69b56e95f991f"}, +] + +[package.dependencies] +webencodings = "*" + +[package.extras] +css = ["tinycss2 (>=1.1.0,<1.5)"] + [[package]] name = "certifi" version = "2025.7.14" @@ -860,6 +878,18 @@ platformdirs = ">=3.9.1,<5" docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + [[package]] name = "weeknotebot" version = "1.7.0" @@ -882,4 +912,4 @@ rich = ">=13.9.4,<14.0.0" [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "c00284e308116f498c84c3a39944727cee6c2d8f4f14439efcd8d781329329da" +content-hash = "99647e0cf2079c1607e915fffc58c2acdd357b0ad6b0c167d8ace8a943625524" diff --git a/pyproject.toml b/pyproject.toml index 67476a1c..32849e31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ pyyaml = "*" lxml = "*" typer = "*" python-frontmatter = "^1.1.0" +bleach = "^6.2.0" [tool.poetry.group.dev.dependencies]